@siteed/expo-audio-stream 1.17.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -1
- package/README.md +1 -1
- package/android/src/main/java/net/siteed/audiostream/AudioAnalysisData.kt +68 -22
- package/android/src/main/java/net/siteed/audiostream/AudioFormatUtils.kt +24 -0
- package/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt +836 -386
- package/android/src/main/java/net/siteed/audiostream/AudioRecorderManager.kt +0 -2
- package/android/src/main/java/net/siteed/audiostream/AudioRecordingService.kt +35 -29
- package/android/src/main/java/net/siteed/audiostream/ExpoAudioStreamModule.kt +236 -96
- package/android/src/main/java/net/siteed/audiostream/FFT.kt +55 -0
- package/android/src/main/java/net/siteed/audiostream/Features.kt +49 -7
- package/android/src/main/java/net/siteed/audiostream/RecordingConfig.kt +2 -4
- package/build/AudioAnalysis/AudioAnalysis.types.d.ts +55 -47
- package/build/AudioAnalysis/AudioAnalysis.types.d.ts.map +1 -1
- package/build/AudioAnalysis/AudioAnalysis.types.js.map +1 -1
- package/build/AudioAnalysis/extractAudioAnalysis.d.ts +60 -13
- package/build/AudioAnalysis/extractAudioAnalysis.d.ts.map +1 -1
- package/build/AudioAnalysis/extractAudioAnalysis.js +147 -162
- package/build/AudioAnalysis/extractAudioAnalysis.js.map +1 -1
- package/build/ExpoAudioStream.types.d.ts +47 -3
- package/build/ExpoAudioStream.types.d.ts.map +1 -1
- package/build/ExpoAudioStream.types.js.map +1 -1
- package/build/ExpoAudioStream.web.d.ts.map +1 -1
- package/build/ExpoAudioStream.web.js +0 -1
- package/build/ExpoAudioStream.web.js.map +1 -1
- package/build/ExpoAudioStreamModule.d.ts.map +1 -1
- package/build/ExpoAudioStreamModule.js +216 -12
- package/build/ExpoAudioStreamModule.js.map +1 -1
- package/build/WebRecorder.web.d.ts +67 -13
- package/build/WebRecorder.web.d.ts.map +1 -1
- package/build/WebRecorder.web.js +177 -173
- package/build/WebRecorder.web.js.map +1 -1
- package/build/index.d.ts +3 -3
- package/build/index.d.ts.map +1 -1
- package/build/index.js +2 -2
- package/build/index.js.map +1 -1
- package/build/useAudioRecorder.d.ts.map +1 -1
- package/build/useAudioRecorder.js +12 -8
- package/build/useAudioRecorder.js.map +1 -1
- package/build/utils/audioProcessing.d.ts +24 -0
- package/build/utils/audioProcessing.d.ts.map +1 -0
- package/build/utils/audioProcessing.js +133 -0
- package/build/utils/audioProcessing.js.map +1 -0
- package/build/workers/InlineFeaturesExtractor.web.d.ts +1 -1
- package/build/workers/InlineFeaturesExtractor.web.d.ts.map +1 -1
- package/build/workers/InlineFeaturesExtractor.web.js +694 -194
- package/build/workers/InlineFeaturesExtractor.web.js.map +1 -1
- package/build/workers/inlineAudioWebWorker.web.d.ts +1 -1
- package/build/workers/inlineAudioWebWorker.web.d.ts.map +1 -1
- package/build/workers/inlineAudioWebWorker.web.js +3 -2
- package/build/workers/inlineAudioWebWorker.web.js.map +1 -1
- package/ios/AudioAnalysisData.swift +51 -16
- package/ios/AudioProcessingHelpers.swift +710 -26
- package/ios/AudioProcessor.swift +334 -185
- package/ios/AudioStreamManager.swift +2 -3
- package/ios/DataPoint.swift +25 -12
- package/ios/DecodingConfig.swift +47 -0
- package/ios/ExpoAudioStreamModule.swift +187 -103
- package/ios/FFT.swift +62 -0
- package/ios/Features.swift +24 -3
- package/ios/RecordingSettings.swift +7 -7
- package/package.json +2 -1
- package/plugin/build/index.js +6 -1
- package/plugin/src/index.ts +9 -1
- package/src/AudioAnalysis/AudioAnalysis.types.ts +68 -52
- package/src/AudioAnalysis/extractAudioAnalysis.ts +223 -219
- package/src/ExpoAudioStream.types.ts +53 -7
- package/src/ExpoAudioStream.web.ts +0 -1
- package/src/ExpoAudioStreamModule.ts +255 -10
- package/src/WebRecorder.web.ts +231 -244
- package/src/index.ts +5 -3
- package/src/useAudioRecorder.tsx +14 -10
- package/src/utils/audioProcessing.ts +205 -0
- package/src/workers/InlineFeaturesExtractor.web.tsx +694 -194
- package/src/workers/inlineAudioWebWorker.web.tsx +3 -2
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// net/siteed/audiostream/AudioProcessor.kt
|
|
1
|
+
// packages/expo-audio-stream/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt
|
|
2
2
|
package net.siteed.audiostream
|
|
3
3
|
|
|
4
4
|
import java.nio.ByteBuffer
|
|
@@ -6,15 +6,14 @@ import java.nio.ByteOrder
|
|
|
6
6
|
import kotlin.math.*
|
|
7
7
|
import android.util.Log
|
|
8
8
|
import java.io.File
|
|
9
|
-
import java.io.IOException
|
|
10
9
|
import java.util.concurrent.atomic.AtomicLong
|
|
11
10
|
import kotlin.system.measureTimeMillis
|
|
12
11
|
import android.media.MediaExtractor
|
|
13
12
|
import android.media.MediaFormat
|
|
14
13
|
import android.media.MediaCodec
|
|
15
14
|
import java.io.FileInputStream
|
|
16
|
-
import java.nio.channels.FileChannel
|
|
17
15
|
import java.io.RandomAccessFile
|
|
16
|
+
import java.util.zip.CRC32
|
|
18
17
|
|
|
19
18
|
data class DecodingConfig(
|
|
20
19
|
val targetSampleRate: Int? = null, // Optional target sample rate
|
|
@@ -25,13 +24,9 @@ data class DecodingConfig(
|
|
|
25
24
|
|
|
26
25
|
class AudioProcessor(private val filesDir: File) {
|
|
27
26
|
companion object {
|
|
28
|
-
const val NUM_MFCC_COEFFICIENTS = 13
|
|
29
|
-
const val NUM_MEL_FILTERS = 26
|
|
30
|
-
const val MEL_MIN_FREQ = 0.0
|
|
31
|
-
const val MEL_MAX_FREQ_DIVISOR = 2595.0
|
|
32
|
-
const val MEL_MAX_FREQ_CONSTANT = 700.0
|
|
33
27
|
const val DCT_SQRT_DIVISOR = 2.0
|
|
34
|
-
const val
|
|
28
|
+
private const val N_FFT = 1024
|
|
29
|
+
private const val N_CHROMA = 12
|
|
35
30
|
|
|
36
31
|
private val uniqueIdCounter = AtomicLong(0L) // Keep as companion object property to maintain during pause/resume cycles
|
|
37
32
|
|
|
@@ -45,7 +40,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
45
40
|
private var cumulativeMinAmplitude = Float.MAX_VALUE
|
|
46
41
|
private var cumulativeMaxAmplitude = Float.NEGATIVE_INFINITY
|
|
47
42
|
|
|
48
|
-
fun loadAudioFile(filePath: String
|
|
43
|
+
private fun loadAudioFile(filePath: String): AudioData? {
|
|
49
44
|
try {
|
|
50
45
|
val fileUri = filePath.removePrefix("file://")
|
|
51
46
|
Log.d("AudioProcessor", "Processing WAV file: $fileUri")
|
|
@@ -66,10 +61,6 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
66
61
|
return null
|
|
67
62
|
}
|
|
68
63
|
|
|
69
|
-
// Read file size (4 bytes little-endian)
|
|
70
|
-
val fileSizeBytes = ByteArray(4).apply { raf.readFully(this) }
|
|
71
|
-
val expectedFileSize = ByteBuffer.wrap(fileSizeBytes).order(ByteOrder.LITTLE_ENDIAN).int + 8L
|
|
72
|
-
|
|
73
64
|
// Read WAVE header
|
|
74
65
|
val waveHeader = ByteArray(4).apply { raf.readFully(this) }
|
|
75
66
|
if (String(waveHeader) != "WAVE") {
|
|
@@ -180,18 +171,6 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
180
171
|
}
|
|
181
172
|
}
|
|
182
173
|
|
|
183
|
-
private fun byteArrayToInt(bytes: ByteArray): Int {
|
|
184
|
-
return (bytes[0].toInt() and 0xFF) or
|
|
185
|
-
((bytes[1].toInt() and 0xFF) shl 8) or
|
|
186
|
-
((bytes[2].toInt() and 0xFF) shl 16) or
|
|
187
|
-
((bytes[3].toInt() and 0xFF) shl 24)
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
private fun byteArrayToShort(bytes: ByteArray): Short {
|
|
191
|
-
return (bytes[0].toInt() and 0xFF or
|
|
192
|
-
(bytes[1].toInt() and 0xFF shl 8)).toShort()
|
|
193
|
-
}
|
|
194
|
-
|
|
195
174
|
/**
|
|
196
175
|
* Processes the audio data and extracts features.
|
|
197
176
|
* @param data The audio data in bytes.
|
|
@@ -199,6 +178,22 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
199
178
|
* @return AudioAnalysisData containing the extracted features.
|
|
200
179
|
*/
|
|
201
180
|
fun processAudioData(data: ByteArray, config: RecordingConfig): AudioAnalysisData {
|
|
181
|
+
if (data.isEmpty()) {
|
|
182
|
+
Log.e("AudioProcessor", "Received empty audio data")
|
|
183
|
+
return AudioAnalysisData(
|
|
184
|
+
segmentDurationMs = config.segmentDurationMs,
|
|
185
|
+
durationMs = 0,
|
|
186
|
+
bitDepth = 16,
|
|
187
|
+
numberOfChannels = config.channels,
|
|
188
|
+
sampleRate = config.sampleRate,
|
|
189
|
+
samples = 0,
|
|
190
|
+
dataPoints = emptyList(),
|
|
191
|
+
amplitudeRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
|
|
192
|
+
rmsRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
|
|
193
|
+
extractionTimeMs = 0f,
|
|
194
|
+
)
|
|
195
|
+
}
|
|
196
|
+
|
|
202
197
|
val sampleRate = config.sampleRate.toFloat()
|
|
203
198
|
val bitDepth = when (config.encoding) {
|
|
204
199
|
"pcm_8bit" -> 8
|
|
@@ -207,34 +202,33 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
207
202
|
else -> throw IllegalArgumentException("Unsupported encoding: ${config.encoding}")
|
|
208
203
|
}
|
|
209
204
|
val channelData = convertToFloatArray(data, bitDepth)
|
|
210
|
-
val pointsPerSecond = config.pointsPerSecond
|
|
211
|
-
val algorithm = config.algorithm
|
|
212
205
|
val featureOptions = config.features
|
|
213
206
|
|
|
214
207
|
val totalSamples = channelData.size
|
|
215
|
-
|
|
216
|
-
val
|
|
217
|
-
val
|
|
218
|
-
|
|
219
|
-
Log.d("AudioProcessor", "Extracting waveform totalSize=${data.size} with $totalSamples samples
|
|
220
|
-
Log.d("AudioProcessor", "segmentDuration: $
|
|
208
|
+
// Update samplesPerSegment calculation to use proper formula
|
|
209
|
+
val samplesPerSegment = ((config.segmentDurationMs / 1000.0) * sampleRate).toInt()
|
|
210
|
+
val totalPoints = ceil(totalSamples.toDouble() / samplesPerSegment).toInt()
|
|
211
|
+
|
|
212
|
+
Log.d("AudioProcessor", "Extracting waveform totalSize=${data.size} with $totalSamples samples --> $totalPoints points")
|
|
213
|
+
Log.d("AudioProcessor", "segmentDuration: ${config.segmentDurationMs}ms, samplesPerSegment: $samplesPerSegment")
|
|
221
214
|
|
|
222
|
-
|
|
223
|
-
val samplesPerPoint = ceil(channelData.size /
|
|
224
|
-
Log.d("AudioProcessor", "Extracting waveform with
|
|
215
|
+
// Remove expectedPoints calculation since it used pointsPerSecond
|
|
216
|
+
val samplesPerPoint = ceil(channelData.size / totalPoints.toDouble()).toInt()
|
|
217
|
+
Log.d("AudioProcessor", "Extracting waveform with samplesPerPoints=$samplesPerPoint")
|
|
225
218
|
|
|
226
219
|
val dataPoints = mutableListOf<DataPoint>()
|
|
227
220
|
var minAmplitude = Float.MAX_VALUE
|
|
228
221
|
var maxAmplitude = Float.NEGATIVE_INFINITY
|
|
229
|
-
|
|
222
|
+
var minRms = Float.MAX_VALUE
|
|
223
|
+
var maxRms = Float.NEGATIVE_INFINITY
|
|
224
|
+
// Calculate total duration in milliseconds based on sample rate and total samples
|
|
225
|
+
val durationMs = (totalSamples.toFloat() / sampleRate * 1000).toInt()
|
|
230
226
|
|
|
231
227
|
// Measure the time taken for audio processing
|
|
232
228
|
val extractionTimeMs = measureTimeMillis {
|
|
233
|
-
var currentPosition = 0 // Track the current byte position
|
|
234
|
-
|
|
235
229
|
for (i in 0 until totalPoints) {
|
|
236
|
-
val start = i *
|
|
237
|
-
val end = min(start +
|
|
230
|
+
val start = i * samplesPerSegment
|
|
231
|
+
val end = min(start + samplesPerSegment, totalSamples)
|
|
238
232
|
val segmentData = channelData.sliceArray(start until end)
|
|
239
233
|
|
|
240
234
|
var sumSquares = 0f
|
|
@@ -253,12 +247,23 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
253
247
|
localMaxAmplitude = max(localMaxAmplitude, absValue)
|
|
254
248
|
}
|
|
255
249
|
|
|
256
|
-
val features = computeFeatures(
|
|
250
|
+
val features = computeFeatures(
|
|
251
|
+
segmentData = segmentData,
|
|
252
|
+
sampleRate = sampleRate,
|
|
253
|
+
sumSquares = sumSquares,
|
|
254
|
+
zeroCrossings = zeroCrossings,
|
|
255
|
+
segmentLength = segmentData.size,
|
|
256
|
+
featureOptions = featureOptions,
|
|
257
|
+
minAmplitude = localMinAmplitude,
|
|
258
|
+
maxAmplitude = localMaxAmplitude
|
|
259
|
+
)
|
|
257
260
|
val rms = features.rms
|
|
258
261
|
val silent = rms < 0.01
|
|
259
|
-
val dB =
|
|
262
|
+
val dB = 20 * log10(rms.toDouble()).toFloat()
|
|
260
263
|
minAmplitude = min(minAmplitude, localMinAmplitude)
|
|
261
264
|
maxAmplitude = max(maxAmplitude, localMaxAmplitude)
|
|
265
|
+
minRms = min(minRms, rms)
|
|
266
|
+
maxRms = max(maxRms, rms)
|
|
262
267
|
|
|
263
268
|
val bytesPerSample = bitDepth / 8
|
|
264
269
|
val startPosition = start * bytesPerSample * config.channels
|
|
@@ -269,18 +274,18 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
269
274
|
cumulativeMaxAmplitude = max(cumulativeMaxAmplitude, localMaxAmplitude)
|
|
270
275
|
|
|
271
276
|
val dataPoint = DataPoint(
|
|
272
|
-
id = uniqueIdCounter.getAndIncrement(),
|
|
273
|
-
amplitude =
|
|
274
|
-
|
|
277
|
+
id = uniqueIdCounter.getAndIncrement(),
|
|
278
|
+
amplitude = localMaxAmplitude, // Always use peak amplitude
|
|
279
|
+
rms = rms, // Always include RMS
|
|
275
280
|
dB = dB,
|
|
276
281
|
silent = silent,
|
|
277
282
|
features = features,
|
|
278
|
-
|
|
283
|
+
speech = SpeechFeatures(isActive = !silent),
|
|
279
284
|
startTime = startPosition / (sampleRate * bytesPerSample * config.channels),
|
|
280
285
|
endTime = endPosition / (sampleRate * bytesPerSample * config.channels),
|
|
281
286
|
startPosition = startPosition,
|
|
282
287
|
endPosition = endPosition,
|
|
283
|
-
|
|
288
|
+
samples = segmentData.size
|
|
284
289
|
)
|
|
285
290
|
|
|
286
291
|
dataPoints.add(dataPoint)
|
|
@@ -288,16 +293,16 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
288
293
|
}
|
|
289
294
|
|
|
290
295
|
return AudioAnalysisData(
|
|
291
|
-
|
|
296
|
+
segmentDurationMs = config.segmentDurationMs,
|
|
292
297
|
durationMs = durationMs,
|
|
293
298
|
bitDepth = bitDepth,
|
|
294
299
|
numberOfChannels = config.channels,
|
|
295
|
-
sampleRate = config.sampleRate,
|
|
296
|
-
samples = totalSamples,
|
|
300
|
+
sampleRate = config.sampleRate, // Use config.sampleRate instead of sampleRate
|
|
301
|
+
samples = totalSamples, // Use totalSamples instead of samplesInRange
|
|
297
302
|
dataPoints = dataPoints,
|
|
298
|
-
amplitudeRange = AudioAnalysisData.AmplitudeRange(
|
|
299
|
-
|
|
300
|
-
extractionTimeMs = extractionTimeMs.toFloat()
|
|
303
|
+
amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
|
|
304
|
+
rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
|
|
305
|
+
extractionTimeMs = extractionTimeMs.toFloat()
|
|
301
306
|
)
|
|
302
307
|
}
|
|
303
308
|
|
|
@@ -358,45 +363,33 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
358
363
|
val zcr = if (featureOptions["zcr"] == true) zeroCrossings / segmentLength.toFloat() else 0f
|
|
359
364
|
|
|
360
365
|
val mfcc = try {
|
|
361
|
-
if (featureOptions["mfcc"] == true)
|
|
366
|
+
if (featureOptions["mfcc"] == true) computeMFCC(segmentData, sampleRate) else emptyList()
|
|
362
367
|
} catch (e: Exception) {
|
|
363
368
|
Log.e("AudioProcessor", "Failed to extract MFCC: ${e.message}", e)
|
|
364
369
|
emptyList()
|
|
365
370
|
}
|
|
366
371
|
|
|
367
|
-
val
|
|
368
|
-
if (featureOptions["
|
|
369
|
-
} catch (e: Exception) {
|
|
370
|
-
Log.e("AudioProcessor", "Failed to extract spectral centroid: ${e.message}", e)
|
|
371
|
-
0f
|
|
372
|
-
}
|
|
373
|
-
|
|
374
|
-
val spectralFlatness = try {
|
|
375
|
-
if (featureOptions["spectralFlatness"] == true) extractSpectralFlatness(segmentData) else 0f
|
|
376
|
-
} catch (e: Exception) {
|
|
377
|
-
Log.e("AudioProcessor", "Failed to extract spectral flatness: ${e.message}", e)
|
|
378
|
-
0f
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
val spectralRollOff = try {
|
|
382
|
-
if (featureOptions["spectralRollOff"] == true) extractSpectralRollOff(segmentData, sampleRate) else 0f
|
|
372
|
+
val melSpectrogram = try {
|
|
373
|
+
if (featureOptions["melSpectrogram"] == true) computeMelSpectrogram(segmentData, sampleRate) else emptyList()
|
|
383
374
|
} catch (e: Exception) {
|
|
384
|
-
Log.e("AudioProcessor", "Failed to
|
|
385
|
-
|
|
375
|
+
Log.e("AudioProcessor", "Failed to compute mel spectrogram: ${e.message}", e)
|
|
376
|
+
emptyList()
|
|
386
377
|
}
|
|
387
378
|
|
|
388
|
-
val
|
|
389
|
-
if (featureOptions["
|
|
379
|
+
val chroma = try {
|
|
380
|
+
if (featureOptions["chromagram"] == true) computeChroma(segmentData, sampleRate) else emptyList()
|
|
390
381
|
} catch (e: Exception) {
|
|
391
|
-
Log.e("AudioProcessor", "Failed to
|
|
392
|
-
|
|
382
|
+
Log.e("AudioProcessor", "Failed to compute chroma: ${e.message}", e)
|
|
383
|
+
emptyList()
|
|
393
384
|
}
|
|
394
385
|
|
|
395
|
-
val
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
386
|
+
val spectralFeatures = if (featureOptions["spectralCentroid"] == true ||
|
|
387
|
+
featureOptions["spectralFlatness"] == true ||
|
|
388
|
+
featureOptions["spectralRollOff"] == true ||
|
|
389
|
+
featureOptions["spectralBandwidth"] == true) {
|
|
390
|
+
extractSpectralFeatures(segmentData, sampleRate)
|
|
391
|
+
} else {
|
|
392
|
+
SpectralFeatures()
|
|
400
393
|
}
|
|
401
394
|
|
|
402
395
|
val tempo = try {
|
|
@@ -413,23 +406,220 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
413
406
|
0f
|
|
414
407
|
}
|
|
415
408
|
|
|
409
|
+
val spectralContrast = try {
|
|
410
|
+
if (featureOptions["spectralContrast"] == true) computeSpectralContrast(segmentData, sampleRate) else emptyList()
|
|
411
|
+
} catch (e: Exception) {
|
|
412
|
+
Log.e("AudioProcessor", "Failed to compute spectral contrast: ${e.message}", e)
|
|
413
|
+
emptyList()
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
val tonnetz = try {
|
|
417
|
+
if (featureOptions["tonnetz"] == true) computeTonnetz(segmentData, sampleRate) else emptyList()
|
|
418
|
+
} catch (e: Exception) {
|
|
419
|
+
Log.e("AudioProcessor", "Failed to compute tonnetz: ${e.message}", e)
|
|
420
|
+
emptyList()
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
val pitch = if (featureOptions["pitch"] == true) estimatePitch(segmentData, sampleRate) else 0.0f
|
|
424
|
+
|
|
425
|
+
val crc32Value = if (featureOptions["crc32"] == true) {
|
|
426
|
+
val byteBuffer = ByteBuffer.allocate(segmentData.size * 4)
|
|
427
|
+
.order(ByteOrder.LITTLE_ENDIAN)
|
|
428
|
+
segmentData.forEach { value ->
|
|
429
|
+
byteBuffer.putFloat(value)
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
val crc32 = CRC32()
|
|
433
|
+
crc32.update(byteBuffer.array())
|
|
434
|
+
crc32.value
|
|
435
|
+
} else null
|
|
436
|
+
|
|
416
437
|
return Features(
|
|
417
438
|
energy = energy,
|
|
418
439
|
mfcc = mfcc,
|
|
419
440
|
rms = rms,
|
|
420
|
-
zcr = zcr,
|
|
421
441
|
minAmplitude = minAmplitude,
|
|
422
442
|
maxAmplitude = maxAmplitude,
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
443
|
+
zcr = zcr,
|
|
444
|
+
spectralCentroid = spectralFeatures.centroid,
|
|
445
|
+
spectralFlatness = spectralFeatures.flatness,
|
|
446
|
+
spectralRollOff = spectralFeatures.rollOff,
|
|
447
|
+
spectralBandwidth = spectralFeatures.bandwidth,
|
|
428
448
|
tempo = tempo,
|
|
429
|
-
hnr = hnr
|
|
449
|
+
hnr = hnr,
|
|
450
|
+
melSpectrogram = melSpectrogram,
|
|
451
|
+
chromagram = chroma,
|
|
452
|
+
spectralContrast = spectralContrast,
|
|
453
|
+
tonnetz = tonnetz,
|
|
454
|
+
pitch = pitch,
|
|
455
|
+
crc32 = crc32Value
|
|
456
|
+
)
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
/**
 * Estimates a tempo value (beats per minute) from the spacing of spectral-flux peaks.
 *
 * The segment is cut into frames of 2048 samples that advance by 512 samples. Each frame is
 * transformed with `FFT.realForward`, a magnitude is derived for every (even, odd) pair of the
 * transformed buffer, and the sum of the positive magnitude increases relative to the previous
 * frame is appended to an onset envelope. Strict local maxima of that envelope count as onsets.
 *
 * @param segmentData Samples of the segment to analyse.
 * @param sampleRate Sample rate of [segmentData] in Hz.
 * @return `60 * sampleRate / (hopLength * averagePeakSpacing)` when at least two peaks exist,
 *   otherwise the fallback `120f`.
 */
private fun extractTempo(segmentData: FloatArray, sampleRate: Float): Float {
    val hopLength = 512
    val frameLength = 2048
    val binCount = frameLength / 2

    // The frame loop below only runs while frameStart < size - frameLength, so a segment this
    // short produces no frames, no peaks, and therefore the default tempo.
    if (segmentData.size <= frameLength) return 120f

    // One transform instance serves every frame instead of being rebuilt per iteration.
    // NOTE(review): assumes an FFT instance can be reused across realForward calls — confirm in FFT.kt.
    val fft = FFT(frameLength)

    // Onset strength signal: one spectral-flux value per frame.
    val onsetEnvelope = ArrayList<Float>()
    var previousSpectrum = FloatArray(binCount)

    for (frameStart in 0 until segmentData.size - frameLength step hopLength) {
        // frameStart + frameLength is always inside the segment here, so the frame is full length.
        val fftData = segmentData.copyOfRange(frameStart, frameStart + frameLength)
        fft.realForward(fftData)

        val magnitudes = FloatArray(binCount) { bin ->
            val re = fftData[2 * bin]
            val im = fftData[2 * bin + 1]
            sqrt(re * re + im * im)
        }

        // Spectral flux: only increases in magnitude contribute.
        var flux = 0f
        for (bin in magnitudes.indices) {
            flux += maxOf(magnitudes[bin] - previousSpectrum[bin], 0f)
        }
        onsetEnvelope.add(flux)
        previousSpectrum = magnitudes
    }

    // Strict local maxima of the envelope (first and last frames can never be peaks).
    val peaks = (1 until onsetEnvelope.size - 1).filter { k ->
        onsetEnvelope[k] > onsetEnvelope[k - 1] && onsetEnvelope[k] > onsetEnvelope[k + 1]
    }

    if (peaks.size <= 1) {
        return 120f // Default tempo if no clear peaks found
    }

    // Peak indices are strictly increasing, so the average spacing is at least one frame.
    val averageInterval = peaks.zipWithNext { earlier, later -> later - earlier }.average().toFloat()
    return 60f * sampleRate / (hopLength * averageInterval)
}
|
|
508
|
+
|
|
509
|
+
/**
 * Computes spectral centroid, flatness, roll-off and bandwidth for one block of samples.
 *
 * At most the first `N_FFT` samples are used. They are passed through `applyHannWindow`,
 * copied into a zero-initialised buffer of length `N_FFT`, and transformed in place with
 * `FFT.realForward`. A magnitude spectrum of `N_FFT / 2 + 1` bins (DC through Nyquist) and its
 * squared counterpart are then handed to the individual `computeSpectral*` helpers.
 *
 * @param samples Samples to analyse; anything beyond the first `N_FFT` entries is ignored.
 * @param sampleRate Sample rate of [samples] in Hz.
 * @return The four spectral descriptors bundled in a [SpectralFeatures].
 */
private fun extractSpectralFeatures(samples: FloatArray, sampleRate: Float): SpectralFeatures {
    // The FFT works on a fixed-size buffer, so longer inputs are truncated before windowing.
    val windowed = applyHannWindow(if (samples.size > N_FFT) samples.copyOf(N_FFT) else samples)

    // FloatArray starts zero-filled, which provides the zero padding for short inputs.
    val paddedSamples = FloatArray(N_FFT)
    windowed.copyInto(paddedSamples, 0, 0, minOf(windowed.size, N_FFT))

    FFT(N_FFT).realForward(paddedSamples)

    val nyquistBin = N_FFT / 2
    val magnitudeSpectrum = FloatArray(nyquistBin + 1)

    // Slot 1 of the transformed buffer is read as the Nyquist value below, so it must not also
    // be treated as the imaginary part of the DC bin: DC is taken from slot 0 alone.
    // NOTE(review): presumes realForward uses the packed layout (slot 0 = DC, slot 1 = Nyquist)
    // — confirm against FFT.kt. If slot 1 is instead a zero imaginary part, this is unchanged.
    magnitudeSpectrum[0] = abs(paddedSamples[0])
    for (bin in 1 until nyquistBin) {
        val re = paddedSamples[2 * bin]
        val im = paddedSamples[2 * bin + 1]
        magnitudeSpectrum[bin] = sqrt(re * re + im * im)
    }
    magnitudeSpectrum[nyquistBin] = abs(paddedSamples[1])

    // Power spectrum (squared magnitudes) is what the flatness measure consumes.
    val powerSpectrum = FloatArray(magnitudeSpectrum.size) { bin ->
        magnitudeSpectrum[bin] * magnitudeSpectrum[bin]
    }

    val centroid = computeSpectralCentroid(magnitudeSpectrum, sampleRate)
    val flatness = computeSpectralFlatness(powerSpectrum)
    val rollOff = computeSpectralRollOff(magnitudeSpectrum, sampleRate)
    val bandwidth = computeSpectralBandwidth(magnitudeSpectrum, sampleRate, centroid)

    return SpectralFeatures(
        centroid = centroid,
        flatness = flatness,
        rollOff = rollOff,
        bandwidth = bandwidth
    )
}
|
|
432
558
|
|
|
559
|
+
/**
 * Returns the magnitude-weighted mean frequency of a spectrum.
 *
 * Bin `k` is assigned the frequency `k * (sampleRate / N_FFT)`.
 *
 * @param magnitudeSpectrum Magnitude per frequency bin, starting at bin 0.
 * @param sampleRate Sample rate in Hz used to convert bin indices to frequencies.
 * @return The centroid in the unit of [sampleRate], or `0f` when the magnitudes sum to exactly zero.
 */
private fun computeSpectralCentroid(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
    val totalMagnitude = magnitudeSpectrum.sum()
    if (totalMagnitude == 0f) return 0f

    val binWidth = sampleRate / N_FFT
    var weightedTotal = 0f
    for (bin in magnitudeSpectrum.indices) {
        weightedTotal += bin * binWidth * magnitudeSpectrum[bin]
    }

    return weightedTotal / totalMagnitude
}
|
|
569
|
+
|
|
570
|
+
/**
 * Returns the spectral flatness: geometric mean divided by arithmetic mean of the power spectrum.
 *
 * The geometric mean is evaluated in log space, with `1e-10f` added to every value so that
 * zero-valued bins do not produce `ln(0)`.
 *
 * @param powerSpectrum Power (squared magnitude) per frequency bin.
 * @return The flatness ratio, or `0f` when [powerSpectrum] is empty or its arithmetic mean is zero.
 */
private fun computeSpectralFlatness(powerSpectrum: FloatArray): Float {
    // Without this guard both means are 0/0 = NaN and NaN would be returned.
    if (powerSpectrum.isEmpty()) return 0f

    var sumLogValues = 0.0f
    for (value in powerSpectrum) {
        sumLogValues += ln(value + 1e-10f) // epsilon keeps ln() finite for zero-valued bins
    }
    val geometricMean = exp(sumLogValues / powerSpectrum.size)
    val arithmeticMean = powerSpectrum.sum() / powerSpectrum.size

    return if (arithmeticMean != 0f) geometricMean / arithmeticMean else 0f
}
|
|
583
|
+
|
|
584
|
+
/**
 * Returns the frequency of the first bin at which the running sum of magnitudes reaches 85 %
 * of the total.
 *
 * Bin `k` is assigned the frequency `k * (sampleRate / N_FFT)`.
 *
 * @param magnitudeSpectrum Magnitude per frequency bin, starting at bin 0.
 * @param sampleRate Sample rate in Hz used to convert the bin index to a frequency.
 * @return The roll-off frequency, or `0f` when no bin reaches the threshold (for example when
 *   the spectrum is empty).
 */
private fun computeSpectralRollOff(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
    val threshold = magnitudeSpectrum.sum() * 0.85f
    var runningTotal = 0f

    magnitudeSpectrum.forEachIndexed { bin, magnitude ->
        runningTotal += magnitude
        if (runningTotal >= threshold) {
            return bin * (sampleRate / N_FFT)
        }
    }

    return 0f
}
|
|
598
|
+
|
|
599
|
+
/**
 * Returns the magnitude-weighted standard deviation of bin frequencies around [centroid].
 *
 * Bin `k` is assigned the frequency `k * sampleRate / (2 * magnitudeSpectrum.size)`.
 * NOTE(review): this bin spacing differs from the `sampleRate / N_FFT` spacing used by the
 * centroid and roll-off helpers; an earlier comment here said it matches the iOS
 * implementation, so it is kept as is — confirm that the mismatch is intended.
 *
 * @param magnitudeSpectrum Magnitude per frequency bin, starting at bin 0.
 * @param sampleRate Sample rate in Hz.
 * @param centroid Centre frequency the spread is measured around.
 * @return The bandwidth, or `0f` when the magnitudes sum to exactly zero.
 */
private fun computeSpectralBandwidth(
    magnitudeSpectrum: FloatArray,
    sampleRate: Float,
    centroid: Float
): Float {
    val totalMagnitude = magnitudeSpectrum.sum()
    if (totalMagnitude == 0f) return 0f

    val frequencyDivisor = 2 * magnitudeSpectrum.size
    var weightedSpread = 0f
    for (bin in magnitudeSpectrum.indices) {
        val frequency = bin * sampleRate / frequencyDivisor
        weightedSpread += magnitudeSpectrum[bin] * (frequency - centroid).pow(2)
    }

    return sqrt(weightedSpread / totalMagnitude)
}
|
|
615
|
+
|
|
616
|
+
/**
 * Bundle of the four spectral descriptors produced together from one magnitude spectrum.
 * Every field defaults to zero, so `SpectralFeatures()` stands in when none was requested.
 *
 * @property centroid Spectral centroid.
 * @property flatness Spectral flatness.
 * @property rollOff Spectral roll-off frequency.
 * @property bandwidth Spectral bandwidth.
 */
private data class SpectralFeatures(
    val centroid: Float = 0.0f,
    val flatness: Float = 0.0f,
    val rollOff: Float = 0.0f,
    val bandwidth: Float = 0.0f
)
|
|
622
|
+
|
|
433
623
|
/**
|
|
434
624
|
* Resets the segment data.
|
|
435
625
|
* @param sumSquaresUpdater Function to reset sum of squares.
|
|
@@ -453,45 +643,38 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
453
643
|
}
|
|
454
644
|
|
|
455
645
|
/**
|
|
456
|
-
*
|
|
457
|
-
* @param segmentData The segment data.
|
|
458
|
-
* @param sampleRate The sample rate of the audio data.
|
|
459
|
-
* @return The MFCC coefficients.
|
|
646
|
+
* Computes the MFCC (Mel-Frequency Cepstral Coefficients) from the audio data.
|
|
460
647
|
*/
|
|
461
|
-
private fun
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
val fft = FFT(fftData.size)
|
|
469
|
-
fft.realForward(fftData)
|
|
648
|
+
private fun computeMFCC(samples: FloatArray, sampleRate: Float): List<Float> {
    // Pipeline: power spectrum (prepareFFT) -> 26 mel-band energies -> natural log ->
    // cosine transform, keeping the first 13 coefficients.
    // Returns an empty list when a mel filter's length differs from the power spectrum's.
    val melBandCount = 26
    val coefficientCount = 13

    val (powerSpectrum, _) = prepareFFT(samples, sampleRate)
    val melFilters = computeMelFilterbank(melBandCount, powerSpectrum.size, sampleRate)

    // Every filter row must line up bin-for-bin with the power spectrum.
    if (!melFilters.all { it.size == powerSpectrum.size }) {
        Log.e("AudioProcessor", "Mel filter size (${melFilters[0].size}) does not match power spectrum size (${powerSpectrum.size})")
        return emptyList()
    }

    // Log energy per mel band; the 1e-10f floor keeps ln() finite for silent bands.
    val logMelEnergies = FloatArray(melBandCount) { band ->
        val filter = melFilters[band]
        var bandEnergy = 0f
        for (bin in powerSpectrum.indices) {
            bandEnergy += powerSpectrum[bin] * filter[bin]
        }
        ln(maxOf(bandEnergy, 1e-10f))
    }

    // Cosine transform of the log energies, each coefficient scaled by sqrt(2 / melBandCount).
    val dctScale = sqrt(2f / melBandCount)
    return List(coefficientCount) { k ->
        var acc = 0f
        for (band in logMelEnergies.indices) {
            acc += logMelEnergies[band] * cos(PI * k * (2 * band + 1) / (2 * melBandCount)).toFloat()
        }
        acc * dctScale
    }
}
|
|
496
679
|
|
|
497
680
|
/**
|
|
@@ -501,32 +684,53 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
501
684
|
* @param sampleRate The sample rate of the audio data.
|
|
502
685
|
* @return A list of Mel filters.
|
|
503
686
|
*/
|
|
504
|
-
private fun
|
|
505
|
-
val
|
|
506
|
-
val
|
|
507
|
-
|
|
508
|
-
|
|
687
|
+
private fun computeMelFilterbank(numFilters: Int, powerSpectrumSize: Int, sampleRate: Float): Array<FloatArray> {
    // Builds numFilters triangular filters, each a FloatArray of powerSpectrumSize weights.
    // Filter edges are spaced evenly on the mel scale between 0 Hz and sampleRate / 2,
    // converted back to Hz, then mapped to spectrum bin indices.
    val melLow = hzToMel(0f)
    val melHigh = hzToMel(sampleRate / 2)
    val melSpacing = (melHigh - melLow) / (numFilters + 1)

    // numFilters + 2 edges: filter f spans edges f (start), f + 1 (peak) and f + 2 (end).
    // Bin indices are clamped to the last valid spectrum index.
    // NOTE(review): the mapping hz * powerSpectrumSize / sampleRate places sampleRate / 2 at
    // bin powerSpectrumSize / 2; whether that is intended depends on how many bins prepareFFT
    // returns — confirm against its implementation.
    val edgeBins = IntArray(numFilters + 2) { edge ->
        val hz = melToHz(melLow + edge * melSpacing)
        minOf((hz * powerSpectrumSize / sampleRate).roundToInt(), powerSpectrumSize - 1)
    }

    val filterbank = Array(numFilters) { FloatArray(powerSpectrumSize) }

    for (f in 0 until numFilters) {
        val row = filterbank[f]
        val startBin = edgeBins[f]
        val centerBin = edgeBins[f + 1]
        val endBin = edgeBins[f + 2]

        // Rising edge: 0 at startBin, approaching 1 just below centerBin.
        // The range is empty when centerBin <= startBin, so no division by zero can occur.
        for (bin in startBin until centerBin) {
            row[bin] = (bin - startBin).toFloat() / (centerBin - startBin).toFloat()
        }

        // Falling edge: 1 at centerBin, decreasing towards endBin (exclusive).
        for (bin in centerBin until endBin) {
            row[bin] = (endBin - bin).toFloat() / (endBin - centerBin).toFloat()
        }
    }

    return filterbank
}
|
|
531
735
|
|
|
532
736
|
/**
|
|
@@ -550,168 +754,11 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
550
754
|
return dct.toList()
|
|
551
755
|
}
|
|
552
756
|
|
|
553
|
-
/**
|
|
554
|
-
* Extracts the spectral centroid from the audio data.
|
|
555
|
-
* @param segmentData The segment data.
|
|
556
|
-
* @param sampleRate The sample rate of the audio data.
|
|
557
|
-
* @return The spectral centroid.
|
|
558
|
-
*/
|
|
559
|
-
private fun extractSpectralCentroid(segmentData: FloatArray, sampleRate: Float): Float {
|
|
560
|
-
val magnitudeSpectrum = segmentData.map { it * it }.toFloatArray()
|
|
561
|
-
val sum = magnitudeSpectrum.sum()
|
|
562
|
-
if (sum == 0f) return 0f
|
|
563
|
-
|
|
564
|
-
val weightedSum = magnitudeSpectrum.mapIndexed { index, value -> index * value }.sum()
|
|
565
|
-
return (weightedSum / sum) * (sampleRate / 2) / magnitudeSpectrum.size
|
|
566
|
-
}
|
|
567
|
-
|
|
568
|
-
/**
|
|
569
|
-
* Extracts the spectral flatness from the audio data.
|
|
570
|
-
* @param segmentData The segment data.
|
|
571
|
-
* @return The spectral flatness.
|
|
572
|
-
*/
|
|
573
|
-
private fun extractSpectralFlatness(segmentData: FloatArray): Float {
|
|
574
|
-
val magnitudeSpectrum = segmentData.map { abs(it) }
|
|
575
|
-
val geometricMean = exp(magnitudeSpectrum.map { ln(it + Float.MIN_VALUE) }.average()).toFloat()
|
|
576
|
-
val arithmeticMean = magnitudeSpectrum.average().toFloat()
|
|
577
|
-
return if (arithmeticMean != 0f) geometricMean / arithmeticMean else 0f
|
|
578
|
-
}
|
|
579
|
-
|
|
580
|
-
/**
|
|
581
|
-
* Extracts the spectral roll-off from the audio data.
|
|
582
|
-
* @param segmentData The segment data.
|
|
583
|
-
* @param sampleRate The sample rate of the audio data.
|
|
584
|
-
* @return The spectral roll-off.
|
|
585
|
-
*/
|
|
586
|
-
private fun extractSpectralRollOff(segmentData: FloatArray, sampleRate: Float): Float {
|
|
587
|
-
val magnitudeSpectrum = segmentData.map { abs(it) }
|
|
588
|
-
val totalEnergy = magnitudeSpectrum.sum()
|
|
589
|
-
var cumulativeEnergy = 0f
|
|
590
|
-
val rollOffThreshold = totalEnergy * 0.85f
|
|
591
|
-
|
|
592
|
-
for ((index, value) in magnitudeSpectrum.withIndex()) {
|
|
593
|
-
cumulativeEnergy += value
|
|
594
|
-
if (cumulativeEnergy >= rollOffThreshold) {
|
|
595
|
-
return index.toFloat() / magnitudeSpectrum.size * (sampleRate / 2)
|
|
596
|
-
}
|
|
597
|
-
}
|
|
598
|
-
|
|
599
|
-
return 0f
|
|
600
|
-
}
|
|
601
|
-
|
|
602
|
-
/**
|
|
603
|
-
* Extracts the spectral bandwidth from the audio data.
|
|
604
|
-
* @param segmentData The segment data.
|
|
605
|
-
* @param sampleRate The sample rate of the audio data.
|
|
606
|
-
* @return The spectral bandwidth.
|
|
607
|
-
*/
|
|
608
|
-
private fun extractSpectralBandwidth(segmentData: FloatArray, sampleRate: Float): Float {
|
|
609
|
-
val centroid = extractSpectralCentroid(segmentData, sampleRate)
|
|
610
|
-
val magnitudeSpectrum = segmentData.map { abs(it) }
|
|
611
|
-
val sum = magnitudeSpectrum.sum()
|
|
612
|
-
if (sum == 0f) return 0f
|
|
613
|
-
|
|
614
|
-
val weightedSum = magnitudeSpectrum.mapIndexed { index, value -> value * (index - centroid).pow(2) }.sum()
|
|
615
|
-
return sqrt(weightedSum / sum)
|
|
616
|
-
}
|
|
617
|
-
|
|
618
|
-
/**
|
|
619
|
-
* Extracts the chromagram from the audio data.
|
|
620
|
-
* @param segmentData The segment data.
|
|
621
|
-
* @param sampleRate The sample rate of the audio data.
|
|
622
|
-
* @return The chromagram.
|
|
623
|
-
*/
|
|
624
|
-
private fun extractChromagram(segmentData: FloatArray, sampleRate: Float): List<Float> {
|
|
625
|
-
val fftData = segmentData.copyOf()
|
|
626
|
-
val fft = FFT(fftData.size)
|
|
627
|
-
fft.realForward(fftData)
|
|
628
|
-
|
|
629
|
-
// Compute the magnitude spectrum
|
|
630
|
-
val magnitudeSpectrum = fftData.map { abs(it) }
|
|
631
|
-
|
|
632
|
-
// Initialize the chromagram with 12 bins (one for each pitch class)
|
|
633
|
-
val chromagram = FloatArray(12)
|
|
634
|
-
|
|
635
|
-
// Map frequencies to pitch classes
|
|
636
|
-
for (i in magnitudeSpectrum.indices) {
|
|
637
|
-
val freq = i * sampleRate / magnitudeSpectrum.size
|
|
638
|
-
val pitchClass = (12 * log2(freq / 440.0) % 12).toInt()
|
|
639
|
-
if (pitchClass in 0..11) {
|
|
640
|
-
chromagram[pitchClass] += magnitudeSpectrum[i]
|
|
641
|
-
}
|
|
642
|
-
}
|
|
643
|
-
|
|
644
|
-
return chromagram.toList()
|
|
645
|
-
}
|
|
646
|
-
|
|
647
|
-
/**
|
|
648
|
-
* Extracts the tempo from the audio data.
|
|
649
|
-
* @param segmentData The segment data.
|
|
650
|
-
* @param sampleRate The sample rate of the audio data.
|
|
651
|
-
* @return The tempo.
|
|
652
|
-
*/
|
|
653
|
-
private fun extractTempo(segmentData: FloatArray, sampleRate: Float): Float {
|
|
654
|
-
// Calculate the onset strength envelope
|
|
655
|
-
val onsetEnv = calculateOnsetEnvelope(segmentData, sampleRate)
|
|
656
|
-
|
|
657
|
-
// Find peaks in the onset envelope
|
|
658
|
-
val peaks = findPeaks(onsetEnv)
|
|
659
|
-
|
|
660
|
-
// Calculate the inter-onset intervals (IOIs)
|
|
661
|
-
val iois = peaks.zipWithNext { a, b -> (b - a).toFloat() / sampleRate }
|
|
662
|
-
|
|
663
|
-
// Calculate the tempo in beats per minute (BPM)
|
|
664
|
-
val avgIoi = iois.average().toFloat()
|
|
665
|
-
return if (avgIoi != 0f) 60f / avgIoi else 0f
|
|
666
|
-
}
|
|
667
|
-
|
|
668
|
-
/**
|
|
669
|
-
* Calculates the onset envelope of the audio signal.
|
|
670
|
-
* @param segmentData The segment data.
|
|
671
|
-
* @param sampleRate The sample rate of the audio data.
|
|
672
|
-
* @return The onset envelope.
|
|
673
|
-
*/
|
|
674
|
-
private fun calculateOnsetEnvelope(segmentData: FloatArray, sampleRate: Float): FloatArray {
|
|
675
|
-
val frameSize = sampleRate.toInt() / 100 // Assume 10ms frames
|
|
676
|
-
val onsetEnv = FloatArray(segmentData.size / frameSize)
|
|
677
|
-
var previousSpectrum = FloatArray(frameSize)
|
|
678
|
-
|
|
679
|
-
for (i in onsetEnv.indices) {
|
|
680
|
-
val frame = segmentData.sliceArray(i * frameSize until min((i + 1) * frameSize, segmentData.size))
|
|
681
|
-
val magnitudeSpectrum = frame.map { abs(it) }.toFloatArray()
|
|
682
|
-
val onset = magnitudeSpectrum.zip(previousSpectrum) { a, b -> max(0f, a - b) }.sum()
|
|
683
|
-
onsetEnv[i] = onset
|
|
684
|
-
previousSpectrum = magnitudeSpectrum
|
|
685
|
-
}
|
|
686
|
-
|
|
687
|
-
return onsetEnv
|
|
688
|
-
}
|
|
689
|
-
|
|
690
|
-
/**
|
|
691
|
-
* Finds the peaks in the onset envelope.
|
|
692
|
-
* @param onsetEnv The onset envelope.
|
|
693
|
-
* @return A list of peak indices.
|
|
694
|
-
*/
|
|
695
|
-
private fun findPeaks(onsetEnv: FloatArray): List<Int> {
|
|
696
|
-
val peaks = mutableListOf<Int>()
|
|
697
|
-
for (i in 1 until onsetEnv.size - 1) {
|
|
698
|
-
if (onsetEnv[i] > onsetEnv[i - 1] && onsetEnv[i] > onsetEnv[i + 1]) {
|
|
699
|
-
peaks.add(i)
|
|
700
|
-
}
|
|
701
|
-
}
|
|
702
|
-
return peaks
|
|
703
|
-
}
|
|
704
|
-
|
|
705
757
|
/**
|
|
706
758
|
* Extracts the HNR (Harmonics-to-Noise Ratio) from the audio data.
|
|
707
759
|
* @param segmentData The segment data.
|
|
708
760
|
* @return The HNR.
|
|
709
761
|
*/
|
|
710
|
-
/**
|
|
711
|
-
* Extracts the HNR (Harmonics-to-Noise Ratio) from the audio data.
|
|
712
|
-
* @param segmentData The segment data as FloatArray.
|
|
713
|
-
* @return The HNR.
|
|
714
|
-
*/
|
|
715
762
|
private fun extractHNR(segmentData: FloatArray): Float {
|
|
716
763
|
val frameSize = segmentData.size
|
|
717
764
|
val autocorrelation = FloatArray(frameSize)
|
|
@@ -725,11 +772,33 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
725
772
|
autocorrelation[i] = sum
|
|
726
773
|
}
|
|
727
774
|
|
|
728
|
-
// Find
|
|
729
|
-
val maxAutocorrelation = autocorrelation.
|
|
775
|
+
// Find peaks with minimum prominence
|
|
776
|
+
val maxAutocorrelation = autocorrelation.maxOrNull() ?: 0f
|
|
777
|
+
val peaks = findPeaks(autocorrelation, minProminence = 0.1f * maxAutocorrelation)
|
|
778
|
+
|
|
779
|
+
if (peaks.isNotEmpty()) {
|
|
780
|
+
val firstPeakIndex = peaks.firstOrNull { it > 0 } ?: 0
|
|
781
|
+
val harmonicEnergy = autocorrelation[firstPeakIndex]
|
|
782
|
+
val noiseEnergy = autocorrelation[0] - harmonicEnergy
|
|
783
|
+
if (noiseEnergy > 0) {
|
|
784
|
+
return 10 * log10(harmonicEnergy / noiseEnergy)
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
return 0f
|
|
789
|
+
}
|
|
730
790
|
|
|
731
|
-
|
|
732
|
-
|
|
791
|
+
/**
 * Finds strict local maxima in [data] that stand out from their immediate neighbours.
 *
 * An interior index is reported when its value is strictly greater than both adjacent
 * values and exceeds the larger of the two by at least [minProminence]. The first and
 * last elements are never reported because they lack a neighbour on one side.
 *
 * @param data Values to scan.
 * @param minProminence Minimum height above the taller adjacent value.
 * @return Indices of the qualifying peaks in ascending order.
 */
private fun findPeaks(data: FloatArray, minProminence: Float): List<Int> =
    (1 until data.size - 1).filter { idx ->
        val left = data[idx - 1]
        val right = data[idx + 1]
        val isLocalMax = data[idx] > left && data[idx] > right
        isLocalMax && data[idx] - maxOf(left, right) >= minProminence
    }
|
|
734
803
|
|
|
735
804
|
fun loadAudioFromAnyFormat(fileUri: String, decodingConfig: DecodingConfig? = null): AudioData? {
|
|
@@ -799,7 +868,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
799
868
|
// If MediaExtractor failed and file is WAV, try WAV parser
|
|
800
869
|
if (file.name.lowercase().endsWith(".wav")) {
|
|
801
870
|
Log.d("AudioProcessor", "Falling back to WAV parser")
|
|
802
|
-
return loadAudioFile(file.absolutePath
|
|
871
|
+
return loadAudioFile(file.absolutePath)?.let { wavData ->
|
|
803
872
|
if (decodingConfig != null) {
|
|
804
873
|
val processedData = processAudio(
|
|
805
874
|
wavData.data,
|
|
@@ -987,7 +1056,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
987
1056
|
val inputBuffer = ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
|
|
988
1057
|
val outputBuffer = ByteBuffer.wrap(result).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
|
|
989
1058
|
|
|
990
|
-
for (i in
|
|
1059
|
+
for (i in result.indices) {
|
|
991
1060
|
val channelData = ShortArray(targetChannels)
|
|
992
1061
|
for (j in 0 until targetChannels) {
|
|
993
1062
|
channelData[j] = inputBuffer.get()
|
|
@@ -1076,6 +1145,8 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1076
1145
|
val dataPoints = mutableListOf<DataPoint>()
|
|
1077
1146
|
var minAmplitude = Float.MAX_VALUE
|
|
1078
1147
|
var maxAmplitude = Float.MIN_VALUE
|
|
1148
|
+
var minRms = Float.MAX_VALUE // Add minRms
|
|
1149
|
+
var maxRms = Float.MIN_VALUE // Add maxRms
|
|
1079
1150
|
|
|
1080
1151
|
val extractionTimeMs = measureTimeMillis {
|
|
1081
1152
|
for (i in 0 until numberOfPoints) {
|
|
@@ -1098,22 +1169,27 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1098
1169
|
val startTimePoint = ((pointStartSample * 1000L) / (audioData.sampleRate * audioData.channels)).toFloat()
|
|
1099
1170
|
val endTimePoint = ((pointEndSample * 1000L) / (audioData.sampleRate * audioData.channels)).toFloat()
|
|
1100
1171
|
|
|
1101
|
-
val
|
|
1102
|
-
|
|
1103
|
-
else -> sqrt(segmentData.map { it * it }.average().toFloat())
|
|
1104
|
-
}
|
|
1172
|
+
val rms = sqrt(segmentData.map { it * it }.average().toFloat())
|
|
1173
|
+
val amplitude = segmentData.maxOf { abs(it) } // Always use peak amplitude
|
|
1105
1174
|
|
|
1106
1175
|
minAmplitude = minOf(minAmplitude, amplitude)
|
|
1107
1176
|
maxAmplitude = maxOf(maxAmplitude, amplitude)
|
|
1177
|
+
minRms = minOf(minRms, rms)
|
|
1178
|
+
maxRms = maxOf(maxRms, rms)
|
|
1108
1179
|
|
|
1109
1180
|
dataPoints.add(DataPoint(
|
|
1110
1181
|
id = i.toLong(),
|
|
1111
|
-
amplitude = amplitude,
|
|
1182
|
+
amplitude = amplitude, // Peak amplitude
|
|
1183
|
+
rms = rms, // RMS value
|
|
1184
|
+
dB = 20 * log10(amplitude.toDouble()).toFloat(),
|
|
1185
|
+
silent = amplitude < 0.01,
|
|
1186
|
+
features = null,
|
|
1187
|
+
speech = null,
|
|
1112
1188
|
startTime = startTimePoint,
|
|
1113
1189
|
endTime = endTimePoint,
|
|
1114
1190
|
startPosition = pointStartSample,
|
|
1115
1191
|
endPosition = pointEndSample,
|
|
1116
|
-
samples =
|
|
1192
|
+
samples = segmentData.size
|
|
1117
1193
|
))
|
|
1118
1194
|
} catch (e: Exception) {
|
|
1119
1195
|
Log.e(Constants.TAG, "Error processing segment $i: ${e.message}")
|
|
@@ -1127,7 +1203,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1127
1203
|
}
|
|
1128
1204
|
|
|
1129
1205
|
return AudioAnalysisData(
|
|
1130
|
-
|
|
1206
|
+
segmentDurationMs = config.segmentDurationMs,
|
|
1131
1207
|
durationMs = durationMs.toInt(),
|
|
1132
1208
|
bitDepth = audioData.bitDepth,
|
|
1133
1209
|
numberOfChannels = audioData.channels,
|
|
@@ -1135,7 +1211,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1135
1211
|
samples = samplesInRange,
|
|
1136
1212
|
dataPoints = dataPoints,
|
|
1137
1213
|
amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
|
|
1138
|
-
|
|
1214
|
+
rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
|
|
1139
1215
|
extractionTimeMs = extractionTimeMs.toFloat()
|
|
1140
1216
|
)
|
|
1141
1217
|
}
|
|
@@ -1157,32 +1233,34 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1157
1233
|
return bytes.map { (it.toInt() - 128).toFloat() / 127f }.toFloatArray()
|
|
1158
1234
|
}
|
|
1159
1235
|
|
|
1160
|
-
fun loadAudioRange(
|
|
1161
|
-
fileUri: String,
|
|
1162
|
-
startTimeMs: Long? = null,
|
|
1163
|
-
endTimeMs: Long? = null,
|
|
1164
|
-
config: DecodingConfig
|
|
1165
|
-
): AudioData? {
|
|
1236
|
+
fun loadAudioRange(fileUri: String, startTimeMs: Long, endTimeMs: Long, config: DecodingConfig? = null): AudioData? {
|
|
1166
1237
|
try {
|
|
1167
|
-
//
|
|
1168
|
-
val
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
// Check if it's a WAV file by reading first 4 bytes
|
|
1176
|
-
val isWav = FileInputStream(file).use { fis ->
|
|
1177
|
-
val header = ByteArray(4)
|
|
1178
|
-
fis.read(header)
|
|
1179
|
-
String(header) == "RIFF"
|
|
1180
|
-
}
|
|
1238
|
+
// Use default config if none provided
|
|
1239
|
+
val effectiveConfig = config ?: DecodingConfig(
|
|
1240
|
+
targetSampleRate = null,
|
|
1241
|
+
targetChannels = null,
|
|
1242
|
+
targetBitDepth = 16,
|
|
1243
|
+
normalizeAudio = false
|
|
1244
|
+
)
|
|
1181
1245
|
|
|
1182
|
-
|
|
1183
|
-
|
|
1246
|
+
// First check if it's a WAV file by extension
|
|
1247
|
+
val isWavByExtension = fileUri.lowercase().endsWith(".wav")
|
|
1248
|
+
|
|
1249
|
+
// Then verify WAV header if needed
|
|
1250
|
+
val headerSize = if (isWavByExtension) {
|
|
1251
|
+
getWavHeaderSize(fileUri)
|
|
1252
|
+
} else null
|
|
1253
|
+
|
|
1254
|
+
// If it's a WAV file (by extension and header verification)
|
|
1255
|
+
return if (isWavByExtension && headerSize != null) {
|
|
1256
|
+
Log.d(Constants.TAG, "Loading WAV range with header size: $headerSize bytes")
|
|
1257
|
+
loadWavRange(fileUri, startTimeMs, endTimeMs, effectiveConfig, headerSize)
|
|
1184
1258
|
} else {
|
|
1185
|
-
|
|
1259
|
+
if (isWavByExtension) {
|
|
1260
|
+
Log.w(Constants.TAG, "File has .wav extension but invalid header, falling back to compressed loader")
|
|
1261
|
+
}
|
|
1262
|
+
Log.d(Constants.TAG, "Loading compressed audio range")
|
|
1263
|
+
loadCompressedAudioRange(fileUri, startTimeMs, endTimeMs, effectiveConfig)
|
|
1186
1264
|
}
|
|
1187
1265
|
} catch (e: Exception) {
|
|
1188
1266
|
Log.e(Constants.TAG, "Failed to load audio range: ${e.message}", e)
|
|
@@ -1191,52 +1269,59 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1191
1269
|
}
|
|
1192
1270
|
|
|
1193
1271
|
private fun loadWavRange(
|
|
1194
|
-
|
|
1195
|
-
startTimeMs: Long
|
|
1196
|
-
endTimeMs: Long
|
|
1197
|
-
config: DecodingConfig
|
|
1272
|
+
fileUri: String,
|
|
1273
|
+
startTimeMs: Long,
|
|
1274
|
+
endTimeMs: Long,
|
|
1275
|
+
config: DecodingConfig,
|
|
1276
|
+
headerSize: Int
|
|
1198
1277
|
): AudioData? {
|
|
1199
1278
|
try {
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
val sampleRate = ByteBuffer.wrap(headerBuffer, 24, 4).order(ByteOrder.LITTLE_ENDIAN).int
|
|
1207
|
-
val channels = ByteBuffer.wrap(headerBuffer, 22, 2).order(ByteOrder.LITTLE_ENDIAN).short.toInt()
|
|
1208
|
-
val bitDepth = ByteBuffer.wrap(headerBuffer, 34, 2).order(ByteOrder.LITTLE_ENDIAN).short.toInt()
|
|
1209
|
-
|
|
1210
|
-
// Calculate duration
|
|
1211
|
-
val bytesPerFrame = channels * (bitDepth / 8)
|
|
1212
|
-
val numFrames = (file.length() - 44) / bytesPerFrame // Subtract header size
|
|
1213
|
-
val durationMs = (numFrames * 1000L) / sampleRate
|
|
1279
|
+
val file = File(fileUri.removePrefix("file://")).takeIf { it.exists() }
|
|
1280
|
+
?: File(filesDir, File(fileUri).name).takeIf { it.exists() }
|
|
1281
|
+
?: throw IllegalArgumentException("File not found: $fileUri")
|
|
1282
|
+
|
|
1283
|
+
// Use existing method to get audio format
|
|
1284
|
+
val format = getAudioFormat(fileUri) ?: throw IllegalArgumentException("Could not determine audio format")
|
|
1214
1285
|
|
|
1215
|
-
|
|
1216
|
-
val
|
|
1217
|
-
val
|
|
1218
|
-
val length = (endByte - startByte).toInt()
|
|
1286
|
+
val bytesPerSecond = format.sampleRate * format.channels * (format.bitDepth / 8)
|
|
1287
|
+
val startByteOffset = ((startTimeMs * bytesPerSecond) / 1000).toInt()
|
|
1288
|
+
val endByteOffset = ((endTimeMs * bytesPerSecond) / 1000).toInt()
|
|
1219
1289
|
|
|
1290
|
+
val startByte = headerSize + startByteOffset
|
|
1291
|
+
val endByte = headerSize + endByteOffset
|
|
1292
|
+
|
|
1220
1293
|
Log.d(Constants.TAG, """
|
|
1221
|
-
Loading WAV
|
|
1222
|
-
-
|
|
1223
|
-
-
|
|
1224
|
-
-
|
|
1225
|
-
-
|
|
1294
|
+
Loading WAV range:
|
|
1295
|
+
- headerSize: $headerSize
|
|
1296
|
+
- startByte: $startByte
|
|
1297
|
+
- endByte: $endByte
|
|
1298
|
+
- bytesPerSecond: $bytesPerSecond
|
|
1226
1299
|
""".trimIndent())
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1300
|
+
|
|
1301
|
+
var audioDataBytes = ByteArray((endByte - startByte).coerceAtLeast(0))
|
|
1302
|
+
FileInputStream(file).use { fis ->
|
|
1303
|
+
fis.skip(startByte.toLong())
|
|
1304
|
+
fis.read(audioDataBytes)
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
// Apply bit depth conversion if needed
|
|
1308
|
+
var effectiveBitDepth = format.bitDepth
|
|
1309
|
+
if (config.targetBitDepth != format.bitDepth) {
|
|
1310
|
+
audioDataBytes = AudioFormatUtils.convertBitDepth(
|
|
1311
|
+
audioDataBytes,
|
|
1312
|
+
format.bitDepth,
|
|
1313
|
+
config.targetBitDepth
|
|
1314
|
+
)
|
|
1315
|
+
effectiveBitDepth = config.targetBitDepth
|
|
1316
|
+
Log.d(Constants.TAG, "Converted bit depth from ${format.bitDepth} to ${config.targetBitDepth}")
|
|
1317
|
+
}
|
|
1318
|
+
|
|
1234
1319
|
return AudioData(
|
|
1235
|
-
data =
|
|
1236
|
-
sampleRate =
|
|
1237
|
-
channels =
|
|
1238
|
-
bitDepth =
|
|
1239
|
-
durationMs =
|
|
1320
|
+
data = audioDataBytes,
|
|
1321
|
+
sampleRate = format.sampleRate,
|
|
1322
|
+
channels = format.channels,
|
|
1323
|
+
bitDepth = effectiveBitDepth,
|
|
1324
|
+
durationMs = endTimeMs - startTimeMs
|
|
1240
1325
|
)
|
|
1241
1326
|
} catch (e: Exception) {
|
|
1242
1327
|
Log.e(Constants.TAG, "Failed to load WAV range: ${e.message}", e)
|
|
@@ -1245,16 +1330,16 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1245
1330
|
}
|
|
1246
1331
|
|
|
1247
1332
|
private fun loadCompressedAudioRange(
|
|
1248
|
-
|
|
1249
|
-
startTimeMs: Long
|
|
1250
|
-
endTimeMs: Long
|
|
1333
|
+
fileUri: String,
|
|
1334
|
+
startTimeMs: Long,
|
|
1335
|
+
endTimeMs: Long,
|
|
1251
1336
|
config: DecodingConfig
|
|
1252
1337
|
): AudioData? {
|
|
1253
1338
|
val extractor = MediaExtractor()
|
|
1254
1339
|
var decoder: MediaCodec? = null
|
|
1255
1340
|
|
|
1256
1341
|
try {
|
|
1257
|
-
extractor.setDataSource(file
|
|
1342
|
+
extractor.setDataSource(fileUri.removePrefix("file://"))
|
|
1258
1343
|
val format = extractor.getTrackFormat(0)
|
|
1259
1344
|
extractor.selectTrack(0)
|
|
1260
1345
|
|
|
@@ -1271,8 +1356,8 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1271
1356
|
Log.d("AudioProcessor", "Final duration: ${totalDurationMs}ms")
|
|
1272
1357
|
|
|
1273
1358
|
// Calculate valid time range
|
|
1274
|
-
val validStartMs = startTimeMs
|
|
1275
|
-
val validEndMs = endTimeMs
|
|
1359
|
+
val validStartMs = startTimeMs.coerceIn(0, totalDurationMs) ?: 0
|
|
1360
|
+
val validEndMs = endTimeMs.coerceIn(validStartMs, totalDurationMs) ?: totalDurationMs
|
|
1276
1361
|
val effectiveDurationMs = validEndMs - validStartMs
|
|
1277
1362
|
|
|
1278
1363
|
// Initialize decoder
|
|
@@ -1302,7 +1387,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1302
1387
|
- format: ${targetSampleRate}Hz, $targetChannels channels, $targetBitDepth-bit
|
|
1303
1388
|
""".trimIndent())
|
|
1304
1389
|
|
|
1305
|
-
val outputBuffer = ByteBuffer.
|
|
1390
|
+
val outputBuffer = ByteBuffer.allocate(totalBytes.toInt())
|
|
1306
1391
|
val bufferInfo = MediaCodec.BufferInfo()
|
|
1307
1392
|
var isEOS = false
|
|
1308
1393
|
|
|
@@ -1332,18 +1417,25 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1332
1417
|
// Handle output
|
|
1333
1418
|
val outputBufferId = decoder.dequeueOutputBuffer(bufferInfo, 10000)
|
|
1334
1419
|
if (outputBufferId >= 0) {
|
|
1335
|
-
val
|
|
1420
|
+
val decodedBuffer = decoder.getOutputBuffer(outputBufferId)!!
|
|
1336
1421
|
if (bufferInfo.size > 0) {
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1422
|
+
// Set buffer position and limit based on the decoded data
|
|
1423
|
+
decodedBuffer.position(bufferInfo.offset)
|
|
1424
|
+
decodedBuffer.limit(bufferInfo.offset + bufferInfo.size)
|
|
1425
|
+
|
|
1426
|
+
// Copy decoded data to our output buffer
|
|
1427
|
+
outputBuffer.put(decodedBuffer)
|
|
1342
1428
|
}
|
|
1343
1429
|
decoder.releaseOutputBuffer(outputBufferId, false)
|
|
1430
|
+
|
|
1431
|
+
// Check if we've reached the end
|
|
1432
|
+
if ((bufferInfo.flags and MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) {
|
|
1433
|
+
isEOS = true
|
|
1434
|
+
}
|
|
1344
1435
|
}
|
|
1345
1436
|
}
|
|
1346
1437
|
|
|
1438
|
+
// Prepare the final byte array
|
|
1347
1439
|
outputBuffer.flip()
|
|
1348
1440
|
val audioData = ByteArray(outputBuffer.remaining())
|
|
1349
1441
|
outputBuffer.get(audioData)
|
|
@@ -1353,7 +1445,7 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1353
1445
|
sampleRate = targetSampleRate,
|
|
1354
1446
|
channels = targetChannels,
|
|
1355
1447
|
bitDepth = targetBitDepth,
|
|
1356
|
-
durationMs =
|
|
1448
|
+
durationMs = endTimeMs - startTimeMs // Use the actual time range
|
|
1357
1449
|
).also {
|
|
1358
1450
|
Log.d(Constants.TAG, "Loaded compressed audio with duration: ${effectiveDurationMs}ms")
|
|
1359
1451
|
}
|
|
@@ -1483,4 +1575,362 @@ class AudioProcessor(private val filesDir: File) {
|
|
|
1483
1575
|
// This will help ensure consistent format when joining sections
|
|
1484
1576
|
return audioData
|
|
1485
1577
|
}
|
|
1578
|
+
|
|
1579
|
+
// Add new function to process entire file
|
|
1580
|
+
/**
 * Computes features for the whole recording by treating it as one segment.
 *
 * The raw bytes are converted to float samples, the basic statistics are derived from
 * them, and everything is handed to `computeFeatures` with an empty feature-options map.
 *
 * @param audioData Decoded audio whose `data`, `bitDepth` and `sampleRate` are read.
 * @return The features produced by `computeFeatures` for the full sample buffer.
 */
fun processEntireFile(audioData: AudioData): Features {
    val pcm = convertToFloatArray(audioData.data, audioData.bitDepth)

    // Accumulate the energy in double precision before narrowing back to float.
    val energy = pcm.sumOf { it * it.toDouble() }.toFloat()
    val crossings = countZeroCrossings(pcm)
    val lowest = pcm.minOrNull() ?: 0f
    val highest = pcm.maxOrNull() ?: 0f

    return computeFeatures(
        segmentData = pcm,
        sampleRate = audioData.sampleRate.toFloat(),
        sumSquares = energy,
        zeroCrossings = crossings,
        segmentLength = pcm.size,
        minAmplitude = lowest,
        maxAmplitude = highest,
        featureOptions = mapOf() // No options requested: don't compute complex features
    )
}
|
|
1602
|
+
|
|
1603
|
+
/**
 * Counts sign changes between consecutive samples.
 *
 * A crossing is counted only when the product of two neighbouring samples is strictly
 * negative, so pairs that contain an exact zero are not counted.
 *
 * @param data Samples to inspect.
 * @return Number of adjacent pairs with opposite signs.
 */
private fun countZeroCrossings(data: FloatArray): Int =
    (1 until data.size).count { idx -> data[idx - 1] * data[idx] < 0 }
|
|
1610
|
+
|
|
1611
|
+
/**
 * Converts a frequency in Hz to the mel scale using `2595 * log10(1 + hz / 700)`.
 *
 * @param hz Frequency in Hz.
 * @return The corresponding mel value.
 */
private fun hzToMel(hz: Float): Float {
    val scaled = 1f + hz / 700f
    return 2595f * log10(scaled)
}
|
|
1614
|
+
|
|
1615
|
+
/**
 * Converts a mel value back to Hz using `700 * (10^(mel / 2595) - 1)`.
 *
 * @param mel Value on the mel scale.
 * @return The corresponding frequency in Hz.
 */
private fun melToHz(mel: Float): Float {
    val exponent = mel / 2595f
    return 700f * (10f.pow(exponent) - 1f)
}
|
|
1618
|
+
|
|
1619
|
+
/**
 * Multiplies [samples] by a symmetric Hann window of the same length.
 *
 * The weight for index `i` is `0.5 * (1 - cos(2π * i / (size - 1)))`. Inputs with fewer
 * than two samples are returned as an unchanged copy: the formula would otherwise divide
 * by `size - 1 == 0` and turn a single sample into NaN.
 *
 * @param samples Input samples; not modified.
 * @return A new array holding the windowed samples.
 */
private fun applyHannWindow(samples: FloatArray): FloatArray {
    // A one-point window is defined as 1, and an empty input has nothing to weight.
    if (samples.size < 2) return samples.copyOf()

    // Loop-invariant factors hoisted out of the per-sample computation.
    val twoPi = 2f * PI.toFloat()
    val lastIndex = samples.size - 1

    return FloatArray(samples.size) { i ->
        val multiplier = 0.5f * (1f - cos(twoPi * i / lastIndex))
        samples[i] * multiplier
    }
}
|
|
1627
|
+
|
|
1628
|
+
/**
 * Computes one log-energy value per mel band for [samples].
 *
 * The power spectrum from `prepareFFT` is weighted by each of the 128 filters returned by
 * `computeMelFilterbank`; every band energy is floored at 1e-10 before the natural
 * logarithm is taken.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate in Hz.
 * @return Natural-log band energies, one entry per filter.
 */
private fun computeMelSpectrogram(samples: FloatArray, sampleRate: Float): List<Float> {
    val powerSpectrum = prepareFFT(samples, sampleRate).first
    val filterbank = computeMelFilterbank(
        numFilters = 128,
        powerSpectrumSize = powerSpectrum.size,
        sampleRate = sampleRate
    )

    return filterbank.map { weights ->
        // Weighted sum of the power spectrum, accumulated in ascending bin order.
        val bandEnergy = powerSpectrum.indices.fold(0f) { acc, bin ->
            acc + powerSpectrum[bin] * weights[bin]
        }
        kotlin.math.ln(maxOf(bandEnergy, 1e-10f))
    }
}
|
|
1645
|
+
|
|
1646
|
+
/**
 * Accumulates spectral magnitudes into `N_CHROMA` pitch-class bins.
 *
 * `prepareFFT` returns one magnitude per frequency bin (`fftLength / 2 + 1` entries), so
 * each bin is read directly. The previous code indexed that array as if it were an
 * interleaved real/imaginary buffer, which read unrelated bins and could run past the end
 * of the array. Bin frequencies are derived from the actual FFT length rather than a
 * fixed constant, because `prepareFFT` grows the transform for long inputs.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate in Hz.
 * @return `N_CHROMA` accumulated magnitudes.
 */
private fun computeChroma(samples: FloatArray, sampleRate: Float): List<Float> {
    val (_, magnitudeSpectrum) = prepareFFT(samples, sampleRate)
    val chroma = FloatArray(N_CHROMA) { 0f }

    // The spectrum holds fftLength / 2 + 1 magnitudes; recover the transform length from it.
    val fftLength = (magnitudeSpectrum.size - 1) * 2
    if (fftLength <= 0) return chroma.toList()
    val hzPerBin = sampleRate / fftLength

    // Bin 0 (0 Hz) has no pitch; the last bin (Nyquist) was never visited by the original loop.
    for (bin in 1 until magnitudeSpectrum.size - 1) {
        val freq = bin * hzPerBin
        // NOTE(review): frequencies below 440 Hz give a negative remainder and are skipped,
        // as before. Confirm whether they should wrap into 0..11 instead.
        val pitchClass = (12 * log2(freq / 440.0) % 12).toInt()
        if (pitchClass in chroma.indices) {
            chroma[pitchClass] += magnitudeSpectrum[bin]
        }
    }

    return chroma.toList()
}
|
|
1666
|
+
|
|
1667
|
+
/**
 * Placeholder for spectral contrast extraction; the feature is not implemented yet.
 *
 * The previous version ran a full FFT and discarded the result before returning the same
 * empty list, so that work has been removed.
 *
 * @param samples Input audio samples (currently unused).
 * @param sampleRate Sampling rate in Hz (currently unused).
 * @return Always an empty list.
 */
@Suppress("UNUSED_PARAMETER")
private fun computeSpectralContrast(samples: FloatArray, sampleRate: Float): List<Float> {
    // TODO: implement spectral contrast on top of the magnitude spectrum.
    return emptyList() // Placeholder
}
|
|
1673
|
+
|
|
1674
|
+
/**
 * Projects the chroma vector of [samples] onto six fixed weight rows.
 *
 * Each output value is the dot product of one matrix row with the chroma vector from
 * `computeChroma`; chroma entries missing for a column count as 0.
 *
 * The matrix is now a true 6x12 table. Two rows previously carried 13 and 16 entries.
 * The extra columns could only ever multiply against missing chroma entries (treated as
 * 0), so removing them does not change the result for a 12-bin chroma vector.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate in Hz.
 * @return Six projection values, one per matrix row.
 */
private fun computeTonnetz(samples: FloatArray, sampleRate: Float): List<Float> {
    // First compute chroma features
    val chroma = computeChroma(samples, sampleRate)

    // Transformation matrix (6x12); the trailing comment lists each row's non-zero columns.
    val tonnetzMatrix = arrayOf(
        floatArrayOf(1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f, 0f, 0f), // 0, 4, 7
        floatArrayOf(0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f, 0f), // 1, 5, 8
        floatArrayOf(0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f), // 2, 6, 9
        floatArrayOf(0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f), // 3, 7, 10
        // NOTE(review): this row used to have a third weight at column 14, outside the
        // 12 pitch classes. It may have been meant for column 11 - confirm.
        floatArrayOf(0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 0f), // 4, 8
        floatArrayOf(1f, 0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f)  // 0, 5, 9
    )

    return tonnetzMatrix.map { row ->
        var sum = 0f
        for (i in row.indices) {
            sum += row[i] * (chroma.getOrNull(i) ?: 0f)
        }
        sum
    }
}
|
|
1700
|
+
|
|
1701
|
+
/**
 * Returns the smallest power of two that is greater than or equal to [n].
 *
 * Values of [n] at or below 1 yield 1. The largest power of two an `Int` can hold is
 * 2^30; the previous doubling loop overflowed to 0 for larger requests and never ended,
 * so such requests are now rejected explicitly.
 *
 * @param n Minimum required size.
 * @return A power of two no smaller than [n].
 * @throws IllegalArgumentException if [n] exceeds 2^30.
 */
private fun nextPowerOfTwo(n: Int): Int {
    if (n <= 1) return 1
    require(n <= (1 shl 30)) { "No Int power of two is >= $n" }
    // The highest set bit of n - 1, doubled, is the next power of two at or above n.
    return Integer.highestOneBit(n - 1) shl 1
}
|
|
1708
|
+
|
|
1709
|
+
/**
 * Estimates the pitch of [segment] from an FFT-based autocorrelation.
 *
 * The segment is Hann-windowed, zero-padded to a power-of-two length of at least twice
 * its size, and forward transformed. A mirrored power spectrum is built from the result
 * and inverse transformed; the code treats that output as the autocorrelation and
 * normalizes it by its zero-lag value. The strongest local maximum above 0.3 between the
 * lags for 500 Hz and 50 Hz determines the pitch.
 *
 * @param segment Audio samples to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return The estimated pitch in Hz, or 0 when the segment has fewer than two samples, a
 *   transform step throws, the zero-lag value is zero or non-finite, or no peak qualifies.
 */
private fun estimatePitch(segment: FloatArray, sampleRate: Float): Float {
    if (segment.size < 2) return 0.0f

    // Apply Hann window
    val windowed = applyHannWindow(segment)

    // Pad for FFT - ensure length is power of 2 and sufficient for autocorrelation
    val fftLength = nextPowerOfTwo(segment.size * 2)
    val padded = FloatArray(fftLength) // Zero-initialized
    windowed.copyInto(padded)

    // Perform forward FFT
    val fft = FFT(fftLength)
    try {
        fft.realForward(padded)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "FFT forward transform failed: ${e.message}")
        return 0.0f
    }

    // Compute power spectrum
    val powerSpectrum = FloatArray(fftLength)
    try {
        // DC and Nyquist components are read from the first two slots of the transform
        powerSpectrum[0] = padded[0] * padded[0]
        powerSpectrum[fftLength / 2] = padded[1] * padded[1]

        // Remaining bins are stored as (re, im) pairs
        for (i in 1 until fftLength / 2) {
            val re = padded[2 * i]
            val im = padded[2 * i + 1]
            powerSpectrum[i] = re * re + im * im
            powerSpectrum[fftLength - i] = powerSpectrum[i] // Mirror for inverse FFT
        }
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Power spectrum computation failed: ${e.message}")
        return 0.0f
    }

    // Inverse FFT to get autocorrelation
    val autocorrelation = FloatArray(fftLength)
    try {
        fft.realInverse(powerSpectrum, autocorrelation)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "FFT inverse transform failed: ${e.message}")
        return 0.0f
    }

    // A zero or non-finite zero-lag value (e.g. pure silence) cannot be normalized.
    // Previously this case divided by zero and only returned 0 because every later
    // comparison against NaN happened to be false.
    val zeroLag = autocorrelation[0]
    if (zeroLag == 0f || !zeroLag.isFinite()) return 0.0f

    // Normalize by zero-lag autocorrelation
    val normFactor = 1.0f / zeroLag
    for (i in autocorrelation.indices) {
        autocorrelation[i] *= normFactor
    }

    // Search the lags that correspond to 50-500 Hz
    val minLag = (sampleRate / 500.0f).toInt().coerceAtLeast(1)
    val maxLag = (sampleRate / 50.0f).toInt().coerceAtMost(autocorrelation.size - 1)
    // A peak needs a neighbour on both sides, so the last index can never qualify.
    val lastCandidate = minOf(maxLag, autocorrelation.size - 2)

    val threshold = 0.3f // Correlation threshold
    var maxCorr = -1.0f
    var pitchLag = 0

    // Keep the strongest local maximum above the threshold
    for (lag in minLag..lastCandidate) {
        val value = autocorrelation[lag]
        val isPeak = value > autocorrelation[lag - 1] &&
            value > autocorrelation[lag + 1] &&
            value > threshold

        if (isPeak && value > maxCorr) {
            maxCorr = value
            pitchLag = lag
        }
    }

    return if (pitchLag > 0) sampleRate / pitchLag else 0.0f
}
|
|
1790
|
+
|
|
1791
|
+
/**
 * Applies a Hann window, pads or truncates to [fftLength], runs the forward FFT and
 * derives the magnitude and power spectra.
 *
 * Both spectra have `fftLength / 2 + 1` entries (bin 0 up to the Nyquist bin). The
 * transform output is read with slot 0 as the DC term, slot 1 as the Nyquist term and
 * every later pair as (re, im). The DC magnitude is therefore taken from slot 0 alone;
 * it previously also folded in slot 1, the value this function stores as the Nyquist
 * magnitude.
 *
 * @param samples Input audio samples
 * @param sampleRate Sampling rate in Hz (not used by the computation)
 * @param fftLength FFT size (must be power of 2); defaults to the next power of two at or
 *   above `max(samples.size, 2048)`
 * @return Pair of power spectrum and magnitude spectrum
 */
private fun prepareFFT(samples: FloatArray, sampleRate: Float, fftLength: Int = nextPowerOfTwo(samples.size.coerceAtLeast(2048))): Pair<FloatArray, FloatArray> {
    val windowed = applyHannWindow(samples)
    val padded = windowed.copyOf(fftLength) // Zero-pads, or truncates if samples is longer
    val fft = FFT(fftLength)
    fft.realForward(padded)

    val halfLength = fftLength / 2
    val magnitudeSpectrum = FloatArray(halfLength + 1)

    // DC and Nyquist are purely real and occupy the first two slots.
    magnitudeSpectrum[0] = abs(padded[0])
    for (i in 1 until halfLength) {
        val re = padded[2 * i]
        val im = padded[2 * i + 1]
        magnitudeSpectrum[i] = sqrt(re * re + im * im)
    }
    magnitudeSpectrum[halfLength] = abs(padded[1])

    val powerSpectrum = FloatArray(magnitudeSpectrum.size) { bin ->
        magnitudeSpectrum[bin] * magnitudeSpectrum[bin]
    }
    return Pair(powerSpectrum, magnitudeSpectrum)
}
|
|
1815
|
+
|
|
1816
|
+
/**
 * Basic description of an audio stream, as returned by `getAudioFormat`.
 *
 * @property sampleRate Sample rate of the audio track
 * @property channels Number of audio channels
 * @property bitDepth Bit depth of the decoded samples
 */
data class AudioFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitDepth: Int
)
|
|
1821
|
+
|
|
1822
|
+
/**
 * Reads the sample rate and channel count of the first audio track in a media file.
 *
 * The file is looked up at the given path first and, failing that, by its file name
 * inside `filesDir`.
 *
 * @param fileUri Path or `file://` URI of the audio file
 * @return The detected [AudioFormat], or null if the file does not exist, contains no
 *         audio track, or its format cannot be read
 */
fun getAudioFormat(fileUri: String): AudioFormat? {
    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            Log.e(Constants.TAG, "File not found: $cleanUri")
            return null
        }

    val extractor = MediaExtractor()
    try {
        extractor.setDataSource(file.absolutePath)

        // Do not assume track 0 is audio: containers may list a video or metadata track first.
        val format = (0 until extractor.trackCount)
            .asSequence()
            .map { trackIndex -> extractor.getTrackFormat(trackIndex) }
            .firstOrNull { it.getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true }
            ?: run {
                Log.e(Constants.TAG, "No audio track found in: ${file.absolutePath}")
                return null
            }

        return AudioFormat(
            sampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE),
            channels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT),
            bitDepth = 16 // Most compressed formats decode to 16-bit PCM
        )
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to get audio format: ${e.message}")
        return null
    } finally {
        // Runs on every exit path above, including the early returns.
        extractor.release()
    }
}
|
|
1846
|
+
|
|
1847
|
+
/**
 * Gets the size of the audio file header.
 * For WAV files, this includes the RIFF header and all metadata chunks before the data chunk.
 * For other formats, this will return null as header size handling is format-specific.
 *
 * @param fileUri The URI of the audio file to analyze
 * @return The size of the header in bytes (i.e. the offset of the first audio byte), or null if:
 *         - The file is not a WAV file
 *         - The file cannot be read
 *         - The file format is invalid
 *         - The data chunk cannot be found
 *
 * WAV File Structure:
 * - RIFF header (12 bytes)
 *   - "RIFF" identifier (4 bytes)
 *   - File size (4 bytes)
 *   - "WAVE" identifier (4 bytes)
 * - Format chunk ("fmt ") (24 bytes typically)
 * - Optional metadata chunks (variable size)
 *   - LIST (metadata like artist, title)
 *   - JUNK (padding)
 *   - fact (additional format info)
 *   - cue  (cue points)
 * - Data chunk
 *   - "data" identifier (4 bytes)
 *   - Chunk size (4 bytes)
 *   - Actual audio data
 *
 * Every chunk body is followed by one pad byte when its declared size is odd.
 */
fun getWavHeaderSize(fileUri: String): Int? {
    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            Log.e(Constants.TAG, "File not found: $cleanUri")
            return null
        }

    return try {
        // `use` closes the stream on every exit path, including exceptions.
        FileInputStream(file).use { input -> findWavDataOffset(input) }
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Error calculating WAV header size: ${e.message}")
        null
    }
}

/**
 * Walks the RIFF chunk list of [input] and returns the byte offset at which the payload of
 * the "data" chunk starts, or null when the stream is not a valid WAV file or has no data chunk.
 */
private fun findWavDataOffset(input: FileInputStream): Int? {
    val buffer = ByteArray(12)

    // read() may legally deliver fewer bytes than requested, so loop until `count` bytes arrive.
    fun fill(count: Int): Boolean {
        var filled = 0
        while (filled < count) {
            val n = input.read(buffer, filled, count - filled)
            if (n < 0) return false
            filled += n
        }
        return true
    }

    if (!fill(12)) {
        Log.e(Constants.TAG, "Failed to read RIFF header")
        return null
    }

    val riffId = String(buffer, 0, 4, Charsets.US_ASCII)
    val waveId = String(buffer, 8, 4, Charsets.US_ASCII)
    if (riffId != "RIFF" || waveId != "WAVE") {
        Log.e(Constants.TAG, "Invalid WAV file format")
        return null
    }

    // Tracked as Long so that large or corrupt chunk sizes cannot overflow silently.
    var offset = 12L

    while (true) {
        if (!fill(8)) {
            // Ran out of chunks without seeing "data": there is no meaningful header size.
            Log.e(Constants.TAG, "Unexpected end of file while reading chunks")
            return null
        }

        val chunkId = String(buffer, 0, 4, Charsets.US_ASCII)
        // Chunk size is an unsigned 32-bit little-endian value.
        val chunkSize = (buffer[4].toLong() and 0xFF) or
            ((buffer[5].toLong() and 0xFF) shl 8) or
            ((buffer[6].toLong() and 0xFF) shl 16) or
            ((buffer[7].toLong() and 0xFF) shl 24)
        Log.d(Constants.TAG, "Found chunk: $chunkId, size: $chunkSize")

        offset += 8 // Chunk header (id + size)

        if (chunkId == "data") {
            if (offset > Int.MAX_VALUE) {
                Log.e(Constants.TAG, "WAV header size exceeds supported range: $offset")
                return null
            }
            Log.d(Constants.TAG, "Found data chunk at offset: $offset")
            Log.d(Constants.TAG, "Total WAV header size: $offset bytes")
            return offset.toInt()
        }

        // Odd-sized chunk bodies are followed by a pad byte that is not counted in chunkSize.
        var remaining = chunkSize + (chunkSize and 1L)
        offset += remaining
        while (remaining > 0) {
            val skipped = input.skip(remaining)
            if (skipped <= 0) {
                Log.e(Constants.TAG, "Unexpected end of file while reading chunks")
                return null
            }
            remaining -= skipped
        }
    }
}
|
|
1486
1936
|
}
|