@siteed/expo-audio-stream 1.17.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CHANGELOG.md +26 -1
  2. package/README.md +1 -1
  3. package/android/src/main/java/net/siteed/audiostream/AudioAnalysisData.kt +68 -22
  4. package/android/src/main/java/net/siteed/audiostream/AudioFormatUtils.kt +24 -0
  5. package/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt +836 -386
  6. package/android/src/main/java/net/siteed/audiostream/AudioRecorderManager.kt +0 -2
  7. package/android/src/main/java/net/siteed/audiostream/AudioRecordingService.kt +35 -29
  8. package/android/src/main/java/net/siteed/audiostream/ExpoAudioStreamModule.kt +236 -96
  9. package/android/src/main/java/net/siteed/audiostream/FFT.kt +55 -0
  10. package/android/src/main/java/net/siteed/audiostream/Features.kt +49 -7
  11. package/android/src/main/java/net/siteed/audiostream/RecordingConfig.kt +2 -4
  12. package/build/AudioAnalysis/AudioAnalysis.types.d.ts +55 -47
  13. package/build/AudioAnalysis/AudioAnalysis.types.d.ts.map +1 -1
  14. package/build/AudioAnalysis/AudioAnalysis.types.js.map +1 -1
  15. package/build/AudioAnalysis/extractAudioAnalysis.d.ts +60 -13
  16. package/build/AudioAnalysis/extractAudioAnalysis.d.ts.map +1 -1
  17. package/build/AudioAnalysis/extractAudioAnalysis.js +147 -162
  18. package/build/AudioAnalysis/extractAudioAnalysis.js.map +1 -1
  19. package/build/ExpoAudioStream.types.d.ts +47 -3
  20. package/build/ExpoAudioStream.types.d.ts.map +1 -1
  21. package/build/ExpoAudioStream.types.js.map +1 -1
  22. package/build/ExpoAudioStream.web.d.ts.map +1 -1
  23. package/build/ExpoAudioStream.web.js +0 -1
  24. package/build/ExpoAudioStream.web.js.map +1 -1
  25. package/build/ExpoAudioStreamModule.d.ts.map +1 -1
  26. package/build/ExpoAudioStreamModule.js +216 -12
  27. package/build/ExpoAudioStreamModule.js.map +1 -1
  28. package/build/WebRecorder.web.d.ts +67 -13
  29. package/build/WebRecorder.web.d.ts.map +1 -1
  30. package/build/WebRecorder.web.js +177 -173
  31. package/build/WebRecorder.web.js.map +1 -1
  32. package/build/index.d.ts +3 -3
  33. package/build/index.d.ts.map +1 -1
  34. package/build/index.js +2 -2
  35. package/build/index.js.map +1 -1
  36. package/build/useAudioRecorder.d.ts.map +1 -1
  37. package/build/useAudioRecorder.js +12 -8
  38. package/build/useAudioRecorder.js.map +1 -1
  39. package/build/utils/audioProcessing.d.ts +24 -0
  40. package/build/utils/audioProcessing.d.ts.map +1 -0
  41. package/build/utils/audioProcessing.js +133 -0
  42. package/build/utils/audioProcessing.js.map +1 -0
  43. package/build/workers/InlineFeaturesExtractor.web.d.ts +1 -1
  44. package/build/workers/InlineFeaturesExtractor.web.d.ts.map +1 -1
  45. package/build/workers/InlineFeaturesExtractor.web.js +694 -194
  46. package/build/workers/InlineFeaturesExtractor.web.js.map +1 -1
  47. package/build/workers/inlineAudioWebWorker.web.d.ts +1 -1
  48. package/build/workers/inlineAudioWebWorker.web.d.ts.map +1 -1
  49. package/build/workers/inlineAudioWebWorker.web.js +3 -2
  50. package/build/workers/inlineAudioWebWorker.web.js.map +1 -1
  51. package/ios/AudioAnalysisData.swift +51 -16
  52. package/ios/AudioProcessingHelpers.swift +710 -26
  53. package/ios/AudioProcessor.swift +334 -185
  54. package/ios/AudioStreamManager.swift +2 -3
  55. package/ios/DataPoint.swift +25 -12
  56. package/ios/DecodingConfig.swift +47 -0
  57. package/ios/ExpoAudioStreamModule.swift +187 -103
  58. package/ios/FFT.swift +62 -0
  59. package/ios/Features.swift +24 -3
  60. package/ios/RecordingSettings.swift +7 -7
  61. package/package.json +2 -1
  62. package/plugin/build/index.js +6 -1
  63. package/plugin/src/index.ts +9 -1
  64. package/src/AudioAnalysis/AudioAnalysis.types.ts +68 -52
  65. package/src/AudioAnalysis/extractAudioAnalysis.ts +223 -219
  66. package/src/ExpoAudioStream.types.ts +53 -7
  67. package/src/ExpoAudioStream.web.ts +0 -1
  68. package/src/ExpoAudioStreamModule.ts +255 -10
  69. package/src/WebRecorder.web.ts +231 -244
  70. package/src/index.ts +5 -3
  71. package/src/useAudioRecorder.tsx +14 -10
  72. package/src/utils/audioProcessing.ts +205 -0
  73. package/src/workers/InlineFeaturesExtractor.web.tsx +694 -194
  74. package/src/workers/inlineAudioWebWorker.web.tsx +3 -2
@@ -1,4 +1,4 @@
1
- // net/siteed/audiostream/AudioProcessor.kt
1
+ // packages/expo-audio-stream/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt
2
2
  package net.siteed.audiostream
3
3
 
4
4
  import java.nio.ByteBuffer
@@ -6,15 +6,14 @@ import java.nio.ByteOrder
6
6
  import kotlin.math.*
7
7
  import android.util.Log
8
8
  import java.io.File
9
- import java.io.IOException
10
9
  import java.util.concurrent.atomic.AtomicLong
11
10
  import kotlin.system.measureTimeMillis
12
11
  import android.media.MediaExtractor
13
12
  import android.media.MediaFormat
14
13
  import android.media.MediaCodec
15
14
  import java.io.FileInputStream
16
- import java.nio.channels.FileChannel
17
15
  import java.io.RandomAccessFile
16
+ import java.util.zip.CRC32
18
17
 
19
18
  data class DecodingConfig(
20
19
  val targetSampleRate: Int? = null, // Optional target sample rate
@@ -25,13 +24,9 @@ data class DecodingConfig(
25
24
 
26
25
  class AudioProcessor(private val filesDir: File) {
27
26
  companion object {
28
- const val NUM_MFCC_COEFFICIENTS = 13
29
- const val NUM_MEL_FILTERS = 26
30
- const val MEL_MIN_FREQ = 0.0
31
- const val MEL_MAX_FREQ_DIVISOR = 2595.0
32
- const val MEL_MAX_FREQ_CONSTANT = 700.0
33
27
  const val DCT_SQRT_DIVISOR = 2.0
34
- const val LOG_BASE = 10.0
28
+ private const val N_FFT = 1024
29
+ private const val N_CHROMA = 12
35
30
 
36
31
  private val uniqueIdCounter = AtomicLong(0L) // Keep as companion object property to maintain during pause/resume cycles
37
32
 
@@ -45,7 +40,7 @@ class AudioProcessor(private val filesDir: File) {
45
40
  private var cumulativeMinAmplitude = Float.MAX_VALUE
46
41
  private var cumulativeMaxAmplitude = Float.NEGATIVE_INFINITY
47
42
 
48
- fun loadAudioFile(filePath: String, debug: Boolean = false): AudioData? {
43
+ private fun loadAudioFile(filePath: String): AudioData? {
49
44
  try {
50
45
  val fileUri = filePath.removePrefix("file://")
51
46
  Log.d("AudioProcessor", "Processing WAV file: $fileUri")
@@ -66,10 +61,6 @@ class AudioProcessor(private val filesDir: File) {
66
61
  return null
67
62
  }
68
63
 
69
- // Read file size (4 bytes little-endian)
70
- val fileSizeBytes = ByteArray(4).apply { raf.readFully(this) }
71
- val expectedFileSize = ByteBuffer.wrap(fileSizeBytes).order(ByteOrder.LITTLE_ENDIAN).int + 8L
72
-
73
64
  // Read WAVE header
74
65
  val waveHeader = ByteArray(4).apply { raf.readFully(this) }
75
66
  if (String(waveHeader) != "WAVE") {
@@ -180,18 +171,6 @@ class AudioProcessor(private val filesDir: File) {
180
171
  }
181
172
  }
182
173
 
183
- private fun byteArrayToInt(bytes: ByteArray): Int {
184
- return (bytes[0].toInt() and 0xFF) or
185
- ((bytes[1].toInt() and 0xFF) shl 8) or
186
- ((bytes[2].toInt() and 0xFF) shl 16) or
187
- ((bytes[3].toInt() and 0xFF) shl 24)
188
- }
189
-
190
- private fun byteArrayToShort(bytes: ByteArray): Short {
191
- return (bytes[0].toInt() and 0xFF or
192
- (bytes[1].toInt() and 0xFF shl 8)).toShort()
193
- }
194
-
195
174
  /**
196
175
  * Processes the audio data and extracts features.
197
176
  * @param data The audio data in bytes.
@@ -199,6 +178,22 @@ class AudioProcessor(private val filesDir: File) {
199
178
  * @return AudioAnalysisData containing the extracted features.
200
179
  */
201
180
  fun processAudioData(data: ByteArray, config: RecordingConfig): AudioAnalysisData {
181
+ if (data.isEmpty()) {
182
+ Log.e("AudioProcessor", "Received empty audio data")
183
+ return AudioAnalysisData(
184
+ segmentDurationMs = config.segmentDurationMs,
185
+ durationMs = 0,
186
+ bitDepth = 16,
187
+ numberOfChannels = config.channels,
188
+ sampleRate = config.sampleRate,
189
+ samples = 0,
190
+ dataPoints = emptyList(),
191
+ amplitudeRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
192
+ rmsRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
193
+ extractionTimeMs = 0f,
194
+ )
195
+ }
196
+
202
197
  val sampleRate = config.sampleRate.toFloat()
203
198
  val bitDepth = when (config.encoding) {
204
199
  "pcm_8bit" -> 8
@@ -207,34 +202,33 @@ class AudioProcessor(private val filesDir: File) {
207
202
  else -> throw IllegalArgumentException("Unsupported encoding: ${config.encoding}")
208
203
  }
209
204
  val channelData = convertToFloatArray(data, bitDepth)
210
- val pointsPerSecond = config.pointsPerSecond
211
- val algorithm = config.algorithm
212
205
  val featureOptions = config.features
213
206
 
214
207
  val totalSamples = channelData.size
215
- val segmentDurationSeconds = totalSamples.toDouble() / sampleRate
216
- val totalPoints = max((segmentDurationSeconds * pointsPerSecond).toInt(), 1)
217
- val pointInterval = ceil(totalSamples / totalPoints.toDouble()).toInt()
218
-
219
- Log.d("AudioProcessor", "Extracting waveform totalSize=${data.size} with $totalSamples samples and $pointsPerSecond points per second --> $pointInterval samples per point")
220
- Log.d("AudioProcessor", "segmentDuration: $segmentDurationSeconds seconds")
208
+ // Update samplesPerSegment calculation to use proper formula
209
+ val samplesPerSegment = ((config.segmentDurationMs / 1000.0) * sampleRate).toInt()
210
+ val totalPoints = ceil(totalSamples.toDouble() / samplesPerSegment).toInt()
211
+
212
+ Log.d("AudioProcessor", "Extracting waveform totalSize=${data.size} with $totalSamples samples --> $totalPoints points")
213
+ Log.d("AudioProcessor", "segmentDuration: ${config.segmentDurationMs}ms, samplesPerSegment: $samplesPerSegment")
221
214
 
222
- val expectedPoints = segmentDurationSeconds * pointsPerSecond
223
- val samplesPerPoint = ceil(channelData.size / expectedPoints).toInt()
224
- Log.d("AudioProcessor", "Extracting waveform with expectedPoints=$expectedPoints , samplesPerPoints=$samplesPerPoint")
215
+ // Remove expectedPoints calculation since it used pointsPerSecond
216
+ val samplesPerPoint = ceil(channelData.size / totalPoints.toDouble()).toInt()
217
+ Log.d("AudioProcessor", "Extracting waveform with samplesPerPoints=$samplesPerPoint")
225
218
 
226
219
  val dataPoints = mutableListOf<DataPoint>()
227
220
  var minAmplitude = Float.MAX_VALUE
228
221
  var maxAmplitude = Float.NEGATIVE_INFINITY
229
- val durationMs = (segmentDurationSeconds * 1000).toInt()
222
+ var minRms = Float.MAX_VALUE
223
+ var maxRms = Float.NEGATIVE_INFINITY
224
+ // Calculate total duration in milliseconds based on sample rate and total samples
225
+ val durationMs = (totalSamples.toFloat() / sampleRate * 1000).toInt()
230
226
 
231
227
  // Measure the time taken for audio processing
232
228
  val extractionTimeMs = measureTimeMillis {
233
- var currentPosition = 0 // Track the current byte position
234
-
235
229
  for (i in 0 until totalPoints) {
236
- val start = i * samplesPerPoint
237
- val end = min(start + samplesPerPoint, totalSamples)
230
+ val start = i * samplesPerSegment
231
+ val end = min(start + samplesPerSegment, totalSamples)
238
232
  val segmentData = channelData.sliceArray(start until end)
239
233
 
240
234
  var sumSquares = 0f
@@ -253,12 +247,23 @@ class AudioProcessor(private val filesDir: File) {
253
247
  localMaxAmplitude = max(localMaxAmplitude, absValue)
254
248
  }
255
249
 
256
- val features = computeFeatures(segmentData, sampleRate, minAmplitude, maxAmplitude, sumSquares, zeroCrossings, segmentData.size, featureOptions)
250
+ val features = computeFeatures(
251
+ segmentData = segmentData,
252
+ sampleRate = sampleRate,
253
+ sumSquares = sumSquares,
254
+ zeroCrossings = zeroCrossings,
255
+ segmentLength = segmentData.size,
256
+ featureOptions = featureOptions,
257
+ minAmplitude = localMinAmplitude,
258
+ maxAmplitude = localMaxAmplitude
259
+ )
257
260
  val rms = features.rms
258
261
  val silent = rms < 0.01
259
- val dB = if (featureOptions["dB"] == true) 20 * log10(rms.toDouble()).toFloat() else 0f
262
+ val dB = 20 * log10(rms.toDouble()).toFloat()
260
263
  minAmplitude = min(minAmplitude, localMinAmplitude)
261
264
  maxAmplitude = max(maxAmplitude, localMaxAmplitude)
265
+ minRms = min(minRms, rms)
266
+ maxRms = max(maxRms, rms)
262
267
 
263
268
  val bytesPerSample = bitDepth / 8
264
269
  val startPosition = start * bytesPerSample * config.channels
@@ -269,18 +274,18 @@ class AudioProcessor(private val filesDir: File) {
269
274
  cumulativeMaxAmplitude = max(cumulativeMaxAmplitude, localMaxAmplitude)
270
275
 
271
276
  val dataPoint = DataPoint(
272
- id = uniqueIdCounter.getAndIncrement(), // Assign unique ID and increment the counter
273
- amplitude = if (algorithm == "peak") localMaxAmplitude else rms,
274
- activeSpeech = null,
277
+ id = uniqueIdCounter.getAndIncrement(),
278
+ amplitude = localMaxAmplitude, // Always use peak amplitude
279
+ rms = rms, // Always include RMS
275
280
  dB = dB,
276
281
  silent = silent,
277
282
  features = features,
278
- samples = segmentData.size,
283
+ speech = SpeechFeatures(isActive = !silent),
279
284
  startTime = startPosition / (sampleRate * bytesPerSample * config.channels),
280
285
  endTime = endPosition / (sampleRate * bytesPerSample * config.channels),
281
286
  startPosition = startPosition,
282
287
  endPosition = endPosition,
283
- speaker = 0
288
+ samples = segmentData.size
284
289
  )
285
290
 
286
291
  dataPoints.add(dataPoint)
@@ -288,16 +293,16 @@ class AudioProcessor(private val filesDir: File) {
288
293
  }
289
294
 
290
295
  return AudioAnalysisData(
291
- pointsPerSecond = pointsPerSecond,
296
+ segmentDurationMs = config.segmentDurationMs,
292
297
  durationMs = durationMs,
293
298
  bitDepth = bitDepth,
294
299
  numberOfChannels = config.channels,
295
- sampleRate = config.sampleRate,
296
- samples = totalSamples,
300
+ sampleRate = config.sampleRate, // Use config.sampleRate instead of sampleRate
301
+ samples = totalSamples, // Use totalSamples instead of samplesInRange
297
302
  dataPoints = dataPoints,
298
- amplitudeRange = AudioAnalysisData.AmplitudeRange(cumulativeMinAmplitude, cumulativeMaxAmplitude),
299
- speakerChanges = emptyList(),
300
- extractionTimeMs = extractionTimeMs.toFloat() // Return the measured extraction time
303
+ amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
304
+ rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
305
+ extractionTimeMs = extractionTimeMs.toFloat()
301
306
  )
302
307
  }
303
308
 
@@ -358,45 +363,33 @@ class AudioProcessor(private val filesDir: File) {
358
363
  val zcr = if (featureOptions["zcr"] == true) zeroCrossings / segmentLength.toFloat() else 0f
359
364
 
360
365
  val mfcc = try {
361
- if (featureOptions["mfcc"] == true) extractMFCC(segmentData, sampleRate) else emptyList()
366
+ if (featureOptions["mfcc"] == true) computeMFCC(segmentData, sampleRate) else emptyList()
362
367
  } catch (e: Exception) {
363
368
  Log.e("AudioProcessor", "Failed to extract MFCC: ${e.message}", e)
364
369
  emptyList()
365
370
  }
366
371
 
367
- val spectralCentroid = try {
368
- if (featureOptions["spectralCentroid"] == true) extractSpectralCentroid(segmentData, sampleRate) else 0f
369
- } catch (e: Exception) {
370
- Log.e("AudioProcessor", "Failed to extract spectral centroid: ${e.message}", e)
371
- 0f
372
- }
373
-
374
- val spectralFlatness = try {
375
- if (featureOptions["spectralFlatness"] == true) extractSpectralFlatness(segmentData) else 0f
376
- } catch (e: Exception) {
377
- Log.e("AudioProcessor", "Failed to extract spectral flatness: ${e.message}", e)
378
- 0f
379
- }
380
-
381
- val spectralRollOff = try {
382
- if (featureOptions["spectralRollOff"] == true) extractSpectralRollOff(segmentData, sampleRate) else 0f
372
+ val melSpectrogram = try {
373
+ if (featureOptions["melSpectrogram"] == true) computeMelSpectrogram(segmentData, sampleRate) else emptyList()
383
374
  } catch (e: Exception) {
384
- Log.e("AudioProcessor", "Failed to extract spectral roll-off: ${e.message}", e)
385
- 0f
375
+ Log.e("AudioProcessor", "Failed to compute mel spectrogram: ${e.message}", e)
376
+ emptyList()
386
377
  }
387
378
 
388
- val spectralBandwidth = try {
389
- if (featureOptions["spectralBandwidth"] == true) extractSpectralBandwidth(segmentData, sampleRate) else 0f
379
+ val chroma = try {
380
+ if (featureOptions["chromagram"] == true) computeChroma(segmentData, sampleRate) else emptyList()
390
381
  } catch (e: Exception) {
391
- Log.e("AudioProcessor", "Failed to extract spectral bandwidth: ${e.message}", e)
392
- 0f
382
+ Log.e("AudioProcessor", "Failed to compute chroma: ${e.message}", e)
383
+ emptyList()
393
384
  }
394
385
 
395
- val chromagram = try {
396
- if (featureOptions["chromagram"] == true) extractChromagram(segmentData, sampleRate) else emptyList()
397
- } catch (e: Exception) {
398
- Log.e("AudioProcessor", "Failed to extract chromagram: ${e.message}", e)
399
- emptyList()
386
+ val spectralFeatures = if (featureOptions["spectralCentroid"] == true ||
387
+ featureOptions["spectralFlatness"] == true ||
388
+ featureOptions["spectralRollOff"] == true ||
389
+ featureOptions["spectralBandwidth"] == true) {
390
+ extractSpectralFeatures(segmentData, sampleRate)
391
+ } else {
392
+ SpectralFeatures()
400
393
  }
401
394
 
402
395
  val tempo = try {
@@ -413,23 +406,220 @@ class AudioProcessor(private val filesDir: File) {
413
406
  0f
414
407
  }
415
408
 
409
+ val spectralContrast = try {
410
+ if (featureOptions["spectralContrast"] == true) computeSpectralContrast(segmentData, sampleRate) else emptyList()
411
+ } catch (e: Exception) {
412
+ Log.e("AudioProcessor", "Failed to compute spectral contrast: ${e.message}", e)
413
+ emptyList()
414
+ }
415
+
416
+ val tonnetz = try {
417
+ if (featureOptions["tonnetz"] == true) computeTonnetz(segmentData, sampleRate) else emptyList()
418
+ } catch (e: Exception) {
419
+ Log.e("AudioProcessor", "Failed to compute tonnetz: ${e.message}", e)
420
+ emptyList()
421
+ }
422
+
423
+ val pitch = if (featureOptions["pitch"] == true) estimatePitch(segmentData, sampleRate) else 0.0f
424
+
425
+ val crc32Value = if (featureOptions["crc32"] == true) {
426
+ val byteBuffer = ByteBuffer.allocate(segmentData.size * 4)
427
+ .order(ByteOrder.LITTLE_ENDIAN)
428
+ segmentData.forEach { value ->
429
+ byteBuffer.putFloat(value)
430
+ }
431
+
432
+ val crc32 = CRC32()
433
+ crc32.update(byteBuffer.array())
434
+ crc32.value
435
+ } else null
436
+
416
437
  return Features(
417
438
  energy = energy,
418
439
  mfcc = mfcc,
419
440
  rms = rms,
420
- zcr = zcr,
421
441
  minAmplitude = minAmplitude,
422
442
  maxAmplitude = maxAmplitude,
423
- spectralCentroid = spectralCentroid,
424
- spectralFlatness = spectralFlatness,
425
- spectralRollOff = spectralRollOff,
426
- spectralBandwidth = spectralBandwidth,
427
- chromagram = chromagram,
443
+ zcr = zcr,
444
+ spectralCentroid = spectralFeatures.centroid,
445
+ spectralFlatness = spectralFeatures.flatness,
446
+ spectralRollOff = spectralFeatures.rollOff,
447
+ spectralBandwidth = spectralFeatures.bandwidth,
428
448
  tempo = tempo,
429
- hnr = hnr
449
+ hnr = hnr,
450
+ melSpectrogram = melSpectrogram,
451
+ chromagram = chroma,
452
+ spectralContrast = spectralContrast,
453
+ tonnetz = tonnetz,
454
+ pitch = pitch,
455
+ crc32 = crc32Value
456
+ )
457
+ }
458
+
459
+ private fun extractTempo(segmentData: FloatArray, sampleRate: Float): Float {
460
+ val hopLength = 512
461
+ val frameLength = 2048
462
+
463
+ // Compute onset strength signal using spectral flux
464
+ val onsetEnvelope = mutableListOf<Float>()
465
+ var previousSpectrum = FloatArray(frameLength / 2)
466
+
467
+ // Process frames with spectral flux
468
+ for (i in 0 until segmentData.size - frameLength step hopLength) {
469
+ val frame = segmentData.slice(i until minOf(i + frameLength, segmentData.size)).toFloatArray()
470
+ val fft = FFT(frameLength)
471
+ val fftData = frame.copyOf(frameLength)
472
+ fft.realForward(fftData)
473
+
474
+ // Compute magnitude spectrum
475
+ val magnitudes = FloatArray(frameLength / 2)
476
+ for (j in magnitudes.indices) {
477
+ val re = fftData[2 * j]
478
+ val im = if (2 * j + 1 < fftData.size) fftData[2 * j + 1] else 0f
479
+ magnitudes[j] = sqrt(re * re + im * im)
480
+ }
481
+
482
+ // Calculate spectral flux (sum of positive differences)
483
+ var flux = 0f
484
+ for (j in magnitudes.indices) {
485
+ flux += maxOf(magnitudes[j] - previousSpectrum[j], 0f)
486
+ }
487
+ onsetEnvelope.add(flux)
488
+ previousSpectrum = magnitudes
489
+ }
490
+
491
+ // Find peaks in onset envelope
492
+ val peaks = mutableListOf<Int>()
493
+ for (i in 1 until onsetEnvelope.size - 1) {
494
+ if (onsetEnvelope[i] > onsetEnvelope[i-1] && onsetEnvelope[i] > onsetEnvelope[i+1]) {
495
+ peaks.add(i)
496
+ }
497
+ }
498
+
499
+ // Calculate tempo from peak intervals
500
+ return if (peaks.size > 1) {
501
+ val intervals = peaks.zipWithNext { a, b -> b - a }
502
+ val averageInterval = intervals.average().toFloat()
503
+ 60f * sampleRate / (hopLength * averageInterval)
504
+ } else {
505
+ 120f // Default tempo if no clear peaks found
506
+ }
507
+ }
508
+
509
+ private fun extractSpectralFeatures(samples: FloatArray, sampleRate: Float): SpectralFeatures {
510
+ // FFT requires a fixed-size buffer (N_FFT). If our input is larger,
511
+ // we'll analyze just the first N_FFT samples to prevent buffer overflow.
512
+ // This is a common practice in audio analysis where we process chunks
513
+ // of consistent size rather than variable-length segments.
514
+ val windowed = if (samples.size > N_FFT) {
515
+ // If samples are larger than FFT size, take the first N_FFT samples
516
+ applyHannWindow(samples.copyOf(N_FFT))
517
+ } else {
518
+ applyHannWindow(samples)
519
+ }
520
+
521
+ // Create padded array for FFT, ensuring we don't exceed N_FFT size
522
+ // Zero padding is automatic since FloatArray initializes with zeros
523
+ val paddedSamples = FloatArray(N_FFT).also { padded ->
524
+ windowed.copyInto(padded, 0, 0, minOf(windowed.size, N_FFT))
525
+ }
526
+
527
+ // Perform FFT
528
+ val fft = FFT(N_FFT)
529
+ fft.realForward(paddedSamples)
530
+
531
+ // Calculate magnitude spectrum (only need first half due to symmetry)
532
+ // Add 1 to include both DC (0 Hz) and Nyquist frequency components
533
+ val magnitudeSpectrum = FloatArray(N_FFT / 2 + 1)
534
+ for (i in 0 until N_FFT / 2) { // Since we're only going up to N_FFT/2, the check is unnecessary
535
+ val re = paddedSamples[2 * i]
536
+ val im = paddedSamples[2 * i + 1] // This will always be within bounds
537
+ magnitudeSpectrum[i] = sqrt(re * re + im * im)
538
+ }
539
+ // Handle Nyquist frequency component separately
540
+ magnitudeSpectrum[N_FFT / 2] = abs(paddedSamples[1])
541
+
542
+ // Compute power spectrum for spectral flatness
543
+ val powerSpectrum = magnitudeSpectrum.map { it * it }.toFloatArray()
544
+
545
+ // Compute spectral features
546
+ val centroid = computeSpectralCentroid(magnitudeSpectrum, sampleRate)
547
+ val flatness = computeSpectralFlatness(powerSpectrum)
548
+ val rollOff = computeSpectralRollOff(magnitudeSpectrum, sampleRate)
549
+ val bandwidth = computeSpectralBandwidth(magnitudeSpectrum, sampleRate, centroid)
550
+
551
+ return SpectralFeatures(
552
+ centroid = centroid,
553
+ flatness = flatness,
554
+ rollOff = rollOff,
555
+ bandwidth = bandwidth
430
556
  )
431
557
  }
432
558
 
559
+ private fun computeSpectralCentroid(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
560
+ val sum = magnitudeSpectrum.sum()
561
+ if (sum == 0f) return 0f
562
+
563
+ val weightedSum = magnitudeSpectrum.mapIndexed { index, value ->
564
+ index * (sampleRate / N_FFT) * value
565
+ }.sum()
566
+
567
+ return weightedSum / sum
568
+ }
569
+
570
+ private fun computeSpectralFlatness(powerSpectrum: FloatArray): Float {
571
+ // Calculate geometric mean using log-space to avoid numerical issues
572
+ var sumLogValues = 0.0f
573
+ for (value in powerSpectrum) {
574
+ sumLogValues += ln(value + 1e-10f) // Add small epsilon to avoid log(0)
575
+ }
576
+ val geometricMean = exp(sumLogValues / powerSpectrum.size)
577
+
578
+ // Calculate arithmetic mean
579
+ val arithmeticMean = powerSpectrum.sum() / powerSpectrum.size
580
+
581
+ return if (arithmeticMean != 0f) geometricMean / arithmeticMean else 0f
582
+ }
583
+
584
+ private fun computeSpectralRollOff(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
585
+ val totalEnergy = magnitudeSpectrum.sum()
586
+ var cumulativeEnergy = 0f
587
+ val rollOffThreshold = totalEnergy * 0.85f
588
+
589
+ for ((index, value) in magnitudeSpectrum.withIndex()) {
590
+ cumulativeEnergy += value
591
+ if (cumulativeEnergy >= rollOffThreshold) {
592
+ return index * (sampleRate / N_FFT)
593
+ }
594
+ }
595
+
596
+ return 0f
597
+ }
598
+
599
+ private fun computeSpectralBandwidth(
600
+ magnitudeSpectrum: FloatArray,
601
+ sampleRate: Float,
602
+ centroid: Float
603
+ ): Float {
604
+ val sum = magnitudeSpectrum.sum()
605
+ if (sum == 0f) return 0f
606
+
607
+ // Match iOS frequency calculation
608
+ val weightedSum = magnitudeSpectrum.mapIndexed { index, value ->
609
+ val freq = index * sampleRate / (2 * magnitudeSpectrum.size)
610
+ value * (freq - centroid).pow(2)
611
+ }.sum()
612
+
613
+ return sqrt(weightedSum / sum)
614
+ }
615
+
616
+ private data class SpectralFeatures(
617
+ val centroid: Float = 0f,
618
+ val flatness: Float = 0f,
619
+ val rollOff: Float = 0f,
620
+ val bandwidth: Float = 0f
621
+ )
622
+
433
623
  /**
434
624
  * Resets the segment data.
435
625
  * @param sumSquaresUpdater Function to reset sum of squares.
@@ -453,45 +643,38 @@ class AudioProcessor(private val filesDir: File) {
453
643
  }
454
644
 
455
645
  /**
456
- * Extracts the MFCC (Mel-Frequency Cepstral Coefficients) from the audio data.
457
- * @param segmentData The segment data.
458
- * @param sampleRate The sample rate of the audio data.
459
- * @return The MFCC coefficients.
646
+ * Computes the MFCC (Mel-Frequency Cepstral Coefficients) from the audio data.
460
647
  */
461
- private fun extractMFCC(segmentData: FloatArray, sampleRate: Float): List<Float> {
462
- if (segmentData.size < 2) {
463
- Log.e("AudioProcessor", "Segment data is too small for MFCC extraction: size=${segmentData.size}")
464
- return emptyList()
465
- }
466
-
467
- val fftData = segmentData.copyOf()
468
- val fft = FFT(fftData.size)
469
- fft.realForward(fftData)
648
+ private fun computeMFCC(samples: FloatArray, sampleRate: Float): List<Float> {
649
+ val (powerSpectrum, _) = prepareFFT(samples, sampleRate)
650
+ val melFilters = computeMelFilterbank(
651
+ numFilters = 26,
652
+ powerSpectrumSize = powerSpectrum.size,
653
+ sampleRate = sampleRate
654
+ )
470
655
 
471
- // Compute the power spectrum
472
- val powerSpectrum = try {
473
- fftData.map { it * it }.chunked(2) { (re, im) -> sqrt(re + im) }
474
- } catch (e: Exception) {
475
- Log.e("AudioProcessor", "Error computing power spectrum: ${e.message}", e)
656
+ if (melFilters.any { it.size != powerSpectrum.size }) {
657
+ Log.e("AudioProcessor", "Mel filter size (${melFilters[0].size}) does not match power spectrum size (${powerSpectrum.size})")
476
658
  return emptyList()
477
659
  }
478
660
 
479
- // Compute Mel filter bank
480
- val melFilterBank = computeMelFilterBank(NUM_MEL_FILTERS, powerSpectrum.size, sampleRate)
481
- val filterEnergies = melFilterBank.map { filter ->
482
- filter.zip(powerSpectrum).sumOf { (f, p) -> (f * p).toDouble() }.toFloat()
661
+ val melEnergies = FloatArray(26) { i ->
662
+ var energy = 0f
663
+ for (j in powerSpectrum.indices) {
664
+ energy += powerSpectrum[j] * melFilters[i][j]
665
+ }
666
+ ln(maxOf(energy, 1e-10f))
483
667
  }
484
668
 
485
- // Apply log to filter energies
486
- val logEnergies = filterEnergies.map { ln(it + Float.MIN_VALUE) }
487
-
488
- // Compute Discrete Cosine Transform (DCT) of log energies to get MFCCs
489
- return try {
490
- computeDCT(logEnergies, NUM_MFCC_COEFFICIENTS)
491
- } catch (e: Exception) {
492
- Log.e("AudioProcessor", "Error computing DCT: ${e.message}", e)
493
- emptyList()
669
+ val mfcc = FloatArray(13) { i ->
670
+ var sum = 0f
671
+ for (j in melEnergies.indices) {
672
+ sum += melEnergies[j] * cos(PI * i * (2 * j + 1) / (2 * 26)).toFloat()
673
+ }
674
+ sum * sqrt(2f / 26)
494
675
  }
676
+
677
+ return mfcc.toList()
495
678
  }
496
679
 
497
680
  /**
@@ -501,32 +684,53 @@ class AudioProcessor(private val filesDir: File) {
501
684
  * @param sampleRate The sample rate of the audio data.
502
685
  * @return A list of Mel filters.
503
686
  */
504
- private fun computeMelFilterBank(numFilters: Int, powerSpectrumSize: Int, sampleRate: Float): List<List<Float>> {
505
- val melFilters = mutableListOf<List<Float>>()
506
- val melMaxFreq = MEL_MAX_FREQ_DIVISOR * log10(1.0 + sampleRate / 2.0 / MEL_MAX_FREQ_CONSTANT)
507
- val melPoints = DoubleArray(numFilters + 2) { i ->
508
- MEL_MIN_FREQ + i * (melMaxFreq - MEL_MIN_FREQ) / (numFilters + 1)
687
+ private fun computeMelFilterbank(numFilters: Int, powerSpectrumSize: Int, sampleRate: Float): Array<FloatArray> {
688
+ val fMin = 0f
689
+ val fMax = sampleRate / 2
690
+
691
+ // Convert Hz to Mel
692
+ val melMin = hzToMel(fMin)
693
+ val melMax = hzToMel(fMax)
694
+
695
+ // Create equally spaced points in Mel scale
696
+ val melPoints = FloatArray(numFilters + 2)
697
+ val melStep = (melMax - melMin) / (numFilters + 1)
698
+ for (i in melPoints.indices) {
699
+ melPoints[i] = melMin + i * melStep
509
700
  }
510
701
 
511
- val hzPoints = melPoints.map { MEL_MAX_FREQ_CONSTANT * (LOG_BASE.pow(it / MEL_MAX_FREQ_DIVISOR) - 1.0) }
512
- val bin = hzPoints.map { it * (powerSpectrumSize - 1) / sampleRate }
702
+ // Convert back to Hz
703
+ val hzPoints = melPoints.map { melToHz(it) }
704
+
705
+ // Convert to FFT bin numbers, clamping to valid range
706
+ val bins = hzPoints.map { minOf((it * powerSpectrumSize / sampleRate).roundToInt(), powerSpectrumSize - 1) }.toList()
707
+
708
+ // Create the filterbank matrix with size matching powerSpectrumSize
709
+ val filterbank = Array(numFilters) { FloatArray(powerSpectrumSize) { 0f } }
513
710
 
514
- for (i in 1..numFilters) {
515
- val filter = FloatArray(powerSpectrumSize)
516
- for (j in bin[i - 1].toInt() until bin[i].toInt()) {
517
- if (j >= 0 && j < filter.size) {
518
- filter[j] = ((j - bin[i - 1]) / (bin[i] - bin[i - 1])).toFloat()
711
+ // Ensure safe access to bins by limiting the loop and checking boundaries
712
+ for (i in 0 until numFilters) {
713
+ if (i + 2 < bins.size) { // Check to prevent out-of-bounds access
714
+ val startBin = bins[i]
715
+ val centerBin = bins[i + 1]
716
+ val endBin = bins[i + 2]
717
+
718
+ // Left slope (ascending triangle)
719
+ if (centerBin > startBin) {
720
+ for (j in startBin until centerBin) {
721
+ filterbank[i][j] = (j - startBin).toFloat() / (centerBin - startBin).toFloat()
722
+ }
519
723
  }
520
- }
521
- for (j in bin[i].toInt() until bin[i + 1].toInt()) {
522
- if (j >= 0 && j < filter.size) {
523
- filter[j] = ((bin[i + 1] - j) / (bin[i + 1] - bin[i])).toFloat()
724
+ // Right slope (descending triangle)
725
+ if (endBin > centerBin) {
726
+ for (j in centerBin until endBin) {
727
+ filterbank[i][j] = (endBin - j).toFloat() / (endBin - centerBin).toFloat()
728
+ }
524
729
  }
525
730
  }
526
- melFilters.add(filter.toList())
527
731
  }
528
732
 
529
- return melFilters
733
+ return filterbank
530
734
  }
531
735
 
532
736
  /**
@@ -550,168 +754,11 @@ class AudioProcessor(private val filesDir: File) {
550
754
  return dct.toList()
551
755
  }
552
756
 
553
- /**
554
- * Extracts the spectral centroid from the audio data.
555
- * @param segmentData The segment data.
556
- * @param sampleRate The sample rate of the audio data.
557
- * @return The spectral centroid.
558
- */
559
- private fun extractSpectralCentroid(segmentData: FloatArray, sampleRate: Float): Float {
560
- val magnitudeSpectrum = segmentData.map { it * it }.toFloatArray()
561
- val sum = magnitudeSpectrum.sum()
562
- if (sum == 0f) return 0f
563
-
564
- val weightedSum = magnitudeSpectrum.mapIndexed { index, value -> index * value }.sum()
565
- return (weightedSum / sum) * (sampleRate / 2) / magnitudeSpectrum.size
566
- }
567
-
568
- /**
569
- * Extracts the spectral flatness from the audio data.
570
- * @param segmentData The segment data.
571
- * @return The spectral flatness.
572
- */
573
- private fun extractSpectralFlatness(segmentData: FloatArray): Float {
574
- val magnitudeSpectrum = segmentData.map { abs(it) }
575
- val geometricMean = exp(magnitudeSpectrum.map { ln(it + Float.MIN_VALUE) }.average()).toFloat()
576
- val arithmeticMean = magnitudeSpectrum.average().toFloat()
577
- return if (arithmeticMean != 0f) geometricMean / arithmeticMean else 0f
578
- }
579
-
580
- /**
581
- * Extracts the spectral roll-off from the audio data.
582
- * @param segmentData The segment data.
583
- * @param sampleRate The sample rate of the audio data.
584
- * @return The spectral roll-off.
585
- */
586
- private fun extractSpectralRollOff(segmentData: FloatArray, sampleRate: Float): Float {
587
- val magnitudeSpectrum = segmentData.map { abs(it) }
588
- val totalEnergy = magnitudeSpectrum.sum()
589
- var cumulativeEnergy = 0f
590
- val rollOffThreshold = totalEnergy * 0.85f
591
-
592
- for ((index, value) in magnitudeSpectrum.withIndex()) {
593
- cumulativeEnergy += value
594
- if (cumulativeEnergy >= rollOffThreshold) {
595
- return index.toFloat() / magnitudeSpectrum.size * (sampleRate / 2)
596
- }
597
- }
598
-
599
- return 0f
600
- }
601
-
602
- /**
603
- * Extracts the spectral bandwidth from the audio data.
604
- * @param segmentData The segment data.
605
- * @param sampleRate The sample rate of the audio data.
606
- * @return The spectral bandwidth.
607
- */
608
- private fun extractSpectralBandwidth(segmentData: FloatArray, sampleRate: Float): Float {
609
- val centroid = extractSpectralCentroid(segmentData, sampleRate)
610
- val magnitudeSpectrum = segmentData.map { abs(it) }
611
- val sum = magnitudeSpectrum.sum()
612
- if (sum == 0f) return 0f
613
-
614
- val weightedSum = magnitudeSpectrum.mapIndexed { index, value -> value * (index - centroid).pow(2) }.sum()
615
- return sqrt(weightedSum / sum)
616
- }
617
-
618
- /**
619
- * Extracts the chromagram from the audio data.
620
- * @param segmentData The segment data.
621
- * @param sampleRate The sample rate of the audio data.
622
- * @return The chromagram.
623
- */
624
- private fun extractChromagram(segmentData: FloatArray, sampleRate: Float): List<Float> {
625
- val fftData = segmentData.copyOf()
626
- val fft = FFT(fftData.size)
627
- fft.realForward(fftData)
628
-
629
- // Compute the magnitude spectrum
630
- val magnitudeSpectrum = fftData.map { abs(it) }
631
-
632
- // Initialize the chromagram with 12 bins (one for each pitch class)
633
- val chromagram = FloatArray(12)
634
-
635
- // Map frequencies to pitch classes
636
- for (i in magnitudeSpectrum.indices) {
637
- val freq = i * sampleRate / magnitudeSpectrum.size
638
- val pitchClass = (12 * log2(freq / 440.0) % 12).toInt()
639
- if (pitchClass in 0..11) {
640
- chromagram[pitchClass] += magnitudeSpectrum[i]
641
- }
642
- }
643
-
644
- return chromagram.toList()
645
- }
646
-
647
- /**
648
- * Extracts the tempo from the audio data.
649
- * @param segmentData The segment data.
650
- * @param sampleRate The sample rate of the audio data.
651
- * @return The tempo.
652
- */
653
- private fun extractTempo(segmentData: FloatArray, sampleRate: Float): Float {
654
- // Calculate the onset strength envelope
655
- val onsetEnv = calculateOnsetEnvelope(segmentData, sampleRate)
656
-
657
- // Find peaks in the onset envelope
658
- val peaks = findPeaks(onsetEnv)
659
-
660
- // Calculate the inter-onset intervals (IOIs)
661
- val iois = peaks.zipWithNext { a, b -> (b - a).toFloat() / sampleRate }
662
-
663
- // Calculate the tempo in beats per minute (BPM)
664
- val avgIoi = iois.average().toFloat()
665
- return if (avgIoi != 0f) 60f / avgIoi else 0f
666
- }
667
-
668
- /**
669
- * Calculates the onset envelope of the audio signal.
670
- * @param segmentData The segment data.
671
- * @param sampleRate The sample rate of the audio data.
672
- * @return The onset envelope.
673
- */
674
- private fun calculateOnsetEnvelope(segmentData: FloatArray, sampleRate: Float): FloatArray {
675
- val frameSize = sampleRate.toInt() / 100 // Assume 10ms frames
676
- val onsetEnv = FloatArray(segmentData.size / frameSize)
677
- var previousSpectrum = FloatArray(frameSize)
678
-
679
- for (i in onsetEnv.indices) {
680
- val frame = segmentData.sliceArray(i * frameSize until min((i + 1) * frameSize, segmentData.size))
681
- val magnitudeSpectrum = frame.map { abs(it) }.toFloatArray()
682
- val onset = magnitudeSpectrum.zip(previousSpectrum) { a, b -> max(0f, a - b) }.sum()
683
- onsetEnv[i] = onset
684
- previousSpectrum = magnitudeSpectrum
685
- }
686
-
687
- return onsetEnv
688
- }
689
-
690
- /**
691
- * Finds the peaks in the onset envelope.
692
- * @param onsetEnv The onset envelope.
693
- * @return A list of peak indices.
694
- */
695
- private fun findPeaks(onsetEnv: FloatArray): List<Int> {
696
- val peaks = mutableListOf<Int>()
697
- for (i in 1 until onsetEnv.size - 1) {
698
- if (onsetEnv[i] > onsetEnv[i - 1] && onsetEnv[i] > onsetEnv[i + 1]) {
699
- peaks.add(i)
700
- }
701
- }
702
- return peaks
703
- }
704
-
705
757
  /**
706
758
  * Extracts the HNR (Harmonics-to-Noise Ratio) from the audio data.
707
759
  * @param segmentData The segment data.
708
760
  * @return The HNR.
709
761
  */
710
- /**
711
- * Extracts the HNR (Harmonics-to-Noise Ratio) from the audio data.
712
- * @param segmentData The segment data as FloatArray.
713
- * @return The HNR.
714
- */
715
762
  private fun extractHNR(segmentData: FloatArray): Float {
716
763
  val frameSize = segmentData.size
717
764
  val autocorrelation = FloatArray(frameSize)
@@ -725,11 +772,33 @@ class AudioProcessor(private val filesDir: File) {
725
772
  autocorrelation[i] = sum
726
773
  }
727
774
 
728
- // Find the maximum autocorrelation value (excluding the zero lag)
729
- val maxAutocorrelation = autocorrelation.drop(1).maxOrNull() ?: 0f
775
+ // Find peaks with minimum prominence
776
+ val maxAutocorrelation = autocorrelation.maxOrNull() ?: 0f
777
+ val peaks = findPeaks(autocorrelation, minProminence = 0.1f * maxAutocorrelation)
778
+
779
+ if (peaks.isNotEmpty()) {
780
+ val firstPeakIndex = peaks.firstOrNull { it > 0 } ?: 0
781
+ val harmonicEnergy = autocorrelation[firstPeakIndex]
782
+ val noiseEnergy = autocorrelation[0] - harmonicEnergy
783
+ if (noiseEnergy > 0) {
784
+ return 10 * log10(harmonicEnergy / noiseEnergy)
785
+ }
786
+ }
787
+
788
+ return 0f
789
+ }
730
790
 
731
- // Compute the HNR
732
- return if (autocorrelation[0] != 0f) 10 * log10(maxAutocorrelation / (autocorrelation[0] - maxAutocorrelation)) else 0f
791
/**
 * Locates local maxima in [data] that stand out from their immediate neighbours.
 *
 * An index `i` is reported when `data[i]` is strictly greater than both
 * `data[i - 1]` and `data[i + 1]`, and its height above the larger of those two
 * neighbours is at least [minProminence]. The first and last elements are never
 * reported because they lack a neighbour on one side.
 *
 * @param data The series to scan.
 * @param minProminence Minimum height above the higher adjacent sample.
 * @return Indices of the qualifying peaks, in ascending order.
 */
private fun findPeaks(data: FloatArray, minProminence: Float): List<Int> =
    (1 until data.size - 1).filter { idx ->
        val left = data[idx - 1]
        val centre = data[idx]
        val right = data[idx + 1]
        centre > left && centre > right && centre - maxOf(left, right) >= minProminence
    }
734
803
 
735
804
  fun loadAudioFromAnyFormat(fileUri: String, decodingConfig: DecodingConfig? = null): AudioData? {
@@ -799,7 +868,7 @@ class AudioProcessor(private val filesDir: File) {
799
868
  // If MediaExtractor failed and file is WAV, try WAV parser
800
869
  if (file.name.lowercase().endsWith(".wav")) {
801
870
  Log.d("AudioProcessor", "Falling back to WAV parser")
802
- return loadAudioFile(file.absolutePath, false)?.let { wavData ->
871
+ return loadAudioFile(file.absolutePath)?.let { wavData ->
803
872
  if (decodingConfig != null) {
804
873
  val processedData = processAudio(
805
874
  wavData.data,
@@ -987,7 +1056,7 @@ class AudioProcessor(private val filesDir: File) {
987
1056
  val inputBuffer = ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
988
1057
  val outputBuffer = ByteBuffer.wrap(result).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
989
1058
 
990
- for (i in 0 until result.size) {
1059
+ for (i in result.indices) {
991
1060
  val channelData = ShortArray(targetChannels)
992
1061
  for (j in 0 until targetChannels) {
993
1062
  channelData[j] = inputBuffer.get()
@@ -1076,6 +1145,8 @@ class AudioProcessor(private val filesDir: File) {
1076
1145
  val dataPoints = mutableListOf<DataPoint>()
1077
1146
  var minAmplitude = Float.MAX_VALUE
1078
1147
  var maxAmplitude = Float.MIN_VALUE
1148
+ var minRms = Float.MAX_VALUE // Add minRms
1149
+ var maxRms = Float.MIN_VALUE // Add maxRms
1079
1150
 
1080
1151
  val extractionTimeMs = measureTimeMillis {
1081
1152
  for (i in 0 until numberOfPoints) {
@@ -1098,22 +1169,27 @@ class AudioProcessor(private val filesDir: File) {
1098
1169
  val startTimePoint = ((pointStartSample * 1000L) / (audioData.sampleRate * audioData.channels)).toFloat()
1099
1170
  val endTimePoint = ((pointEndSample * 1000L) / (audioData.sampleRate * audioData.channels)).toFloat()
1100
1171
 
1101
- val amplitude = when (config.algorithm.lowercase()) {
1102
- "peak" -> segmentData.maxOf { abs(it) }
1103
- else -> sqrt(segmentData.map { it * it }.average().toFloat())
1104
- }
1172
+ val rms = sqrt(segmentData.map { it * it }.average().toFloat())
1173
+ val amplitude = segmentData.maxOf { abs(it) } // Always use peak amplitude
1105
1174
 
1106
1175
  minAmplitude = minOf(minAmplitude, amplitude)
1107
1176
  maxAmplitude = maxOf(maxAmplitude, amplitude)
1177
+ minRms = minOf(minRms, rms)
1178
+ maxRms = maxOf(maxRms, rms)
1108
1179
 
1109
1180
  dataPoints.add(DataPoint(
1110
1181
  id = i.toLong(),
1111
- amplitude = amplitude,
1182
+ amplitude = amplitude, // Peak amplitude
1183
+ rms = rms, // RMS value
1184
+ dB = 20 * log10(amplitude.toDouble()).toFloat(),
1185
+ silent = amplitude < 0.01,
1186
+ features = null,
1187
+ speech = null,
1112
1188
  startTime = startTimePoint,
1113
1189
  endTime = endTimePoint,
1114
1190
  startPosition = pointStartSample,
1115
1191
  endPosition = pointEndSample,
1116
- samples = pointEndSample - pointStartSample
1192
+ samples = segmentData.size
1117
1193
  ))
1118
1194
  } catch (e: Exception) {
1119
1195
  Log.e(Constants.TAG, "Error processing segment $i: ${e.message}")
@@ -1127,7 +1203,7 @@ class AudioProcessor(private val filesDir: File) {
1127
1203
  }
1128
1204
 
1129
1205
  return AudioAnalysisData(
1130
- pointsPerSecond = pointsPerSecond,
1206
+ segmentDurationMs = config.segmentDurationMs,
1131
1207
  durationMs = durationMs.toInt(),
1132
1208
  bitDepth = audioData.bitDepth,
1133
1209
  numberOfChannels = audioData.channels,
@@ -1135,7 +1211,7 @@ class AudioProcessor(private val filesDir: File) {
1135
1211
  samples = samplesInRange,
1136
1212
  dataPoints = dataPoints,
1137
1213
  amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
1138
- speakerChanges = emptyList(),
1214
+ rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
1139
1215
  extractionTimeMs = extractionTimeMs.toFloat()
1140
1216
  )
1141
1217
  }
@@ -1157,32 +1233,34 @@ class AudioProcessor(private val filesDir: File) {
1157
1233
  return bytes.map { (it.toInt() - 128).toFloat() / 127f }.toFloatArray()
1158
1234
  }
1159
1235
 
1160
- fun loadAudioRange(
1161
- fileUri: String,
1162
- startTimeMs: Long? = null,
1163
- endTimeMs: Long? = null,
1164
- config: DecodingConfig
1165
- ): AudioData? {
1236
+ fun loadAudioRange(fileUri: String, startTimeMs: Long, endTimeMs: Long, config: DecodingConfig? = null): AudioData? {
1166
1237
  try {
1167
- // Clean up the URI and get a proper File object
1168
- val cleanUri = fileUri.removePrefix("file://")
1169
- val file = File(cleanUri).takeIf { it.exists() } ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
1170
- ?: run {
1171
- Log.e(Constants.TAG, "File not found in any location: $cleanUri")
1172
- return null
1173
- }
1174
-
1175
- // Check if it's a WAV file by reading first 4 bytes
1176
- val isWav = FileInputStream(file).use { fis ->
1177
- val header = ByteArray(4)
1178
- fis.read(header)
1179
- String(header) == "RIFF"
1180
- }
1238
+ // Use default config if none provided
1239
+ val effectiveConfig = config ?: DecodingConfig(
1240
+ targetSampleRate = null,
1241
+ targetChannels = null,
1242
+ targetBitDepth = 16,
1243
+ normalizeAudio = false
1244
+ )
1181
1245
 
1182
- return if (isWav) {
1183
- loadWavRange(file, startTimeMs, endTimeMs, config)
1246
+ // First check if it's a WAV file by extension
1247
+ val isWavByExtension = fileUri.lowercase().endsWith(".wav")
1248
+
1249
+ // Then verify WAV header if needed
1250
+ val headerSize = if (isWavByExtension) {
1251
+ getWavHeaderSize(fileUri)
1252
+ } else null
1253
+
1254
+ // If it's a WAV file (by extension and header verification)
1255
+ return if (isWavByExtension && headerSize != null) {
1256
+ Log.d(Constants.TAG, "Loading WAV range with header size: $headerSize bytes")
1257
+ loadWavRange(fileUri, startTimeMs, endTimeMs, effectiveConfig, headerSize)
1184
1258
  } else {
1185
- loadCompressedAudioRange(file, startTimeMs, endTimeMs, config)
1259
+ if (isWavByExtension) {
1260
+ Log.w(Constants.TAG, "File has .wav extension but invalid header, falling back to compressed loader")
1261
+ }
1262
+ Log.d(Constants.TAG, "Loading compressed audio range")
1263
+ loadCompressedAudioRange(fileUri, startTimeMs, endTimeMs, effectiveConfig)
1186
1264
  }
1187
1265
  } catch (e: Exception) {
1188
1266
  Log.e(Constants.TAG, "Failed to load audio range: ${e.message}", e)
@@ -1191,52 +1269,59 @@ class AudioProcessor(private val filesDir: File) {
1191
1269
  }
1192
1270
 
1193
1271
  private fun loadWavRange(
1194
- file: File,
1195
- startTimeMs: Long?,
1196
- endTimeMs: Long?,
1197
- config: DecodingConfig
1272
+ fileUri: String,
1273
+ startTimeMs: Long,
1274
+ endTimeMs: Long,
1275
+ config: DecodingConfig,
1276
+ headerSize: Int
1198
1277
  ): AudioData? {
1199
1278
  try {
1200
- // Read WAV header to get format info
1201
- val fis = FileInputStream(file)
1202
- val headerBuffer = ByteArray(44) // WAV header is 44 bytes
1203
- fis.read(headerBuffer)
1204
-
1205
- // Parse WAV header
1206
- val sampleRate = ByteBuffer.wrap(headerBuffer, 24, 4).order(ByteOrder.LITTLE_ENDIAN).int
1207
- val channels = ByteBuffer.wrap(headerBuffer, 22, 2).order(ByteOrder.LITTLE_ENDIAN).short.toInt()
1208
- val bitDepth = ByteBuffer.wrap(headerBuffer, 34, 2).order(ByteOrder.LITTLE_ENDIAN).short.toInt()
1209
-
1210
- // Calculate duration
1211
- val bytesPerFrame = channels * (bitDepth / 8)
1212
- val numFrames = (file.length() - 44) / bytesPerFrame // Subtract header size
1213
- val durationMs = (numFrames * 1000L) / sampleRate
1279
+ val file = File(fileUri.removePrefix("file://")).takeIf { it.exists() }
1280
+ ?: File(filesDir, File(fileUri).name).takeIf { it.exists() }
1281
+ ?: throw IllegalArgumentException("File not found: $fileUri")
1282
+
1283
+ // Use existing method to get audio format
1284
+ val format = getAudioFormat(fileUri) ?: throw IllegalArgumentException("Could not determine audio format")
1214
1285
 
1215
- // Calculate positions
1216
- val startByte = 44 + ((startTimeMs ?: 0) * sampleRate * bytesPerFrame / 1000)
1217
- val endByte = 44 + ((endTimeMs ?: (file.length() * 1000 / (sampleRate * bytesPerFrame))) * sampleRate * bytesPerFrame / 1000)
1218
- val length = (endByte - startByte).toInt()
1286
+ val bytesPerSecond = format.sampleRate * format.channels * (format.bitDepth / 8)
1287
+ val startByteOffset = ((startTimeMs * bytesPerSecond) / 1000).toInt()
1288
+ val endByteOffset = ((endTimeMs * bytesPerSecond) / 1000).toInt()
1219
1289
 
1290
+ val startByte = headerSize + startByteOffset
1291
+ val endByte = headerSize + endByteOffset
1292
+
1220
1293
  Log.d(Constants.TAG, """
1221
- Loading WAV section:
1222
- - start: ${startTimeMs}ms (pos: $startByte)
1223
- - end: ${endTimeMs}ms (pos: $endByte)
1224
- - length: $length bytes
1225
- - format: ${sampleRate}Hz, $channels channels, $bitDepth-bit
1294
+ Loading WAV range:
1295
+ - headerSize: $headerSize
1296
+ - startByte: $startByte
1297
+ - endByte: $endByte
1298
+ - bytesPerSecond: $bytesPerSecond
1226
1299
  """.trimIndent())
1227
-
1228
- // Read the requested section
1229
- val audioData = ByteArray(length)
1230
- fis.skip(startByte - 44) // Skip to start position (accounting for header we already read)
1231
- fis.read(audioData)
1232
- fis.close()
1233
-
1300
+
1301
+ var audioDataBytes = ByteArray((endByte - startByte).coerceAtLeast(0))
1302
+ FileInputStream(file).use { fis ->
1303
+ fis.skip(startByte.toLong())
1304
+ fis.read(audioDataBytes)
1305
+ }
1306
+
1307
+ // Apply bit depth conversion if needed
1308
+ var effectiveBitDepth = format.bitDepth
1309
+ if (config.targetBitDepth != format.bitDepth) {
1310
+ audioDataBytes = AudioFormatUtils.convertBitDepth(
1311
+ audioDataBytes,
1312
+ format.bitDepth,
1313
+ config.targetBitDepth
1314
+ )
1315
+ effectiveBitDepth = config.targetBitDepth
1316
+ Log.d(Constants.TAG, "Converted bit depth from ${format.bitDepth} to ${config.targetBitDepth}")
1317
+ }
1318
+
1234
1319
  return AudioData(
1235
- data = audioData,
1236
- sampleRate = config.targetSampleRate ?: sampleRate,
1237
- channels = config.targetChannels ?: channels,
1238
- bitDepth = config.targetBitDepth ?: bitDepth,
1239
- durationMs = durationMs // Pass the duration
1320
+ data = audioDataBytes,
1321
+ sampleRate = format.sampleRate,
1322
+ channels = format.channels,
1323
+ bitDepth = effectiveBitDepth,
1324
+ durationMs = endTimeMs - startTimeMs
1240
1325
  )
1241
1326
  } catch (e: Exception) {
1242
1327
  Log.e(Constants.TAG, "Failed to load WAV range: ${e.message}", e)
@@ -1245,16 +1330,16 @@ class AudioProcessor(private val filesDir: File) {
1245
1330
  }
1246
1331
 
1247
1332
  private fun loadCompressedAudioRange(
1248
- file: File,
1249
- startTimeMs: Long?,
1250
- endTimeMs: Long?,
1333
+ fileUri: String,
1334
+ startTimeMs: Long,
1335
+ endTimeMs: Long,
1251
1336
  config: DecodingConfig
1252
1337
  ): AudioData? {
1253
1338
  val extractor = MediaExtractor()
1254
1339
  var decoder: MediaCodec? = null
1255
1340
 
1256
1341
  try {
1257
- extractor.setDataSource(file.absolutePath)
1342
+ extractor.setDataSource(fileUri.removePrefix("file://"))
1258
1343
  val format = extractor.getTrackFormat(0)
1259
1344
  extractor.selectTrack(0)
1260
1345
 
@@ -1271,8 +1356,8 @@ class AudioProcessor(private val filesDir: File) {
1271
1356
  Log.d("AudioProcessor", "Final duration: ${totalDurationMs}ms")
1272
1357
 
1273
1358
  // Calculate valid time range
1274
- val validStartMs = startTimeMs?.coerceIn(0, totalDurationMs) ?: 0
1275
- val validEndMs = endTimeMs?.coerceIn(validStartMs, totalDurationMs) ?: totalDurationMs
1359
+ val validStartMs = startTimeMs.coerceIn(0, totalDurationMs) ?: 0
1360
+ val validEndMs = endTimeMs.coerceIn(validStartMs, totalDurationMs) ?: totalDurationMs
1276
1361
  val effectiveDurationMs = validEndMs - validStartMs
1277
1362
 
1278
1363
  // Initialize decoder
@@ -1302,7 +1387,7 @@ class AudioProcessor(private val filesDir: File) {
1302
1387
  - format: ${targetSampleRate}Hz, $targetChannels channels, $targetBitDepth-bit
1303
1388
  """.trimIndent())
1304
1389
 
1305
- val outputBuffer = ByteBuffer.allocateDirect(totalBytes.toInt())
1390
+ val outputBuffer = ByteBuffer.allocate(totalBytes.toInt())
1306
1391
  val bufferInfo = MediaCodec.BufferInfo()
1307
1392
  var isEOS = false
1308
1393
 
@@ -1332,18 +1417,25 @@ class AudioProcessor(private val filesDir: File) {
1332
1417
  // Handle output
1333
1418
  val outputBufferId = decoder.dequeueOutputBuffer(bufferInfo, 10000)
1334
1419
  if (outputBufferId >= 0) {
1335
- val outputBuffer = decoder.getOutputBuffer(outputBufferId)!!
1420
+ val decodedBuffer = decoder.getOutputBuffer(outputBufferId)!!
1336
1421
  if (bufferInfo.size > 0) {
1337
- outputBuffer.limit(bufferInfo.offset + bufferInfo.size)
1338
- outputBuffer.position(bufferInfo.offset)
1339
- if (outputBuffer.remaining() <= totalBytes - outputBuffer.position()) {
1340
- outputBuffer.get(ByteArray(outputBuffer.remaining()))
1341
- }
1422
+ // Set buffer position and limit based on the decoded data
1423
+ decodedBuffer.position(bufferInfo.offset)
1424
+ decodedBuffer.limit(bufferInfo.offset + bufferInfo.size)
1425
+
1426
+ // Copy decoded data to our output buffer
1427
+ outputBuffer.put(decodedBuffer)
1342
1428
  }
1343
1429
  decoder.releaseOutputBuffer(outputBufferId, false)
1430
+
1431
+ // Check if we've reached the end
1432
+ if ((bufferInfo.flags and MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) {
1433
+ isEOS = true
1434
+ }
1344
1435
  }
1345
1436
  }
1346
1437
 
1438
+ // Prepare the final byte array
1347
1439
  outputBuffer.flip()
1348
1440
  val audioData = ByteArray(outputBuffer.remaining())
1349
1441
  outputBuffer.get(audioData)
@@ -1353,7 +1445,7 @@ class AudioProcessor(private val filesDir: File) {
1353
1445
  sampleRate = targetSampleRate,
1354
1446
  channels = targetChannels,
1355
1447
  bitDepth = targetBitDepth,
1356
- durationMs = effectiveDurationMs // Pass the duration
1448
+ durationMs = endTimeMs - startTimeMs // Use the actual time range
1357
1449
  ).also {
1358
1450
  Log.d(Constants.TAG, "Loaded compressed audio with duration: ${effectiveDurationMs}ms")
1359
1451
  }
@@ -1483,4 +1575,362 @@ class AudioProcessor(private val filesDir: File) {
1483
1575
  // This will help ensure consistent format when joining sections
1484
1576
  return audioData
1485
1577
  }
1578
+
1579
+ // Add new function to process entire file
1580
/**
 * Computes features for the whole of [audioData], treating it as one segment.
 *
 * The raw bytes are converted to float samples, the basic statistics
 * (sum of squares, zero crossings, min/max sample) are gathered, and everything
 * is forwarded to `computeFeatures` with an empty feature-option map so that
 * no optional features are requested.
 *
 * @param audioData Decoded audio whose `data`, `bitDepth` and `sampleRate` are read.
 * @return The features computed over the whole buffer.
 */
fun processEntireFile(audioData: AudioData): Features {
    val pcm = convertToFloatArray(audioData.data, audioData.bitDepth)

    // Accumulate in Double, then narrow, exactly as the per-segment path expects a Float.
    val energy = pcm.sumOf { it * it.toDouble() }.toFloat()

    return computeFeatures(
        segmentData = pcm,
        sampleRate = audioData.sampleRate.toFloat(),
        sumSquares = energy,
        zeroCrossings = countZeroCrossings(pcm),
        segmentLength = pcm.size,
        minAmplitude = pcm.minOrNull() ?: 0f,
        maxAmplitude = pcm.maxOrNull() ?: 0f,
        featureOptions = mapOf() // Dont compute complex features
    )
}
1602
+
1603
/**
 * Counts sign changes between consecutive samples.
 *
 * A crossing is counted only when the product of two neighbouring samples is
 * strictly negative, so a pair that contains an exact zero does not count.
 *
 * @param data The samples to inspect.
 * @return Number of adjacent pairs with opposite signs.
 */
private fun countZeroCrossings(data: FloatArray): Int =
    (1 until data.size).count { idx -> data[idx - 1] * data[idx] < 0 }
1610
+
1611
/**
 * Converts a frequency in Hz to the mel scale using `2595 * log10(1 + hz / 700)`.
 *
 * @param hz Frequency in Hz.
 * @return The corresponding mel value.
 */
private fun hzToMel(hz: Float): Float = 2595f * log10(1f + hz / 700f)
1614
+
1615
/**
 * Converts a mel value back to Hz using `700 * (10^(mel / 2595) - 1)`.
 *
 * @param mel Value on the mel scale.
 * @return The corresponding frequency in Hz.
 */
private fun melToHz(mel: Float): Float = 700f * (10f.pow(mel / 2595f) - 1f)
1618
+
1619
/**
 * Multiplies [samples] by a symmetric Hann window of the same length.
 *
 * The weight for index `i` is `0.5 * (1 - cos(2 * PI * i / (size - 1)))`.
 * Inputs with fewer than two samples are returned as an unmodified copy:
 * the formula would otherwise divide by zero for a single sample and turn it
 * into NaN.
 *
 * @param samples Input samples; not modified.
 * @return A new array holding the windowed samples.
 */
private fun applyHannWindow(samples: FloatArray): FloatArray {
    // size - 1 is the divisor below; a 0/1-sample window degenerates to weight 1.
    if (samples.size < 2) return samples.copyOf()

    val output = FloatArray(samples.size)
    for (i in samples.indices) {
        val multiplier = 0.5f * (1f - cos(2f * PI.toFloat() * i / (samples.size - 1)))
        output[i] = samples[i] * multiplier
    }
    return output
}
1627
+
1628
/**
 * Computes log mel-band energies for [samples].
 *
 * The power spectrum from `prepareFFT` is weighted by each of 128 mel filters
 * from `computeMelFilterbank`; each band's summed energy is floored at 1e-10
 * and passed through the natural logarithm.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate passed through to the FFT and filterbank helpers.
 * @return One log-energy value per mel filter.
 */
private fun computeMelSpectrogram(samples: FloatArray, sampleRate: Float): List<Float> {
    val powerSpectrum = prepareFFT(samples, sampleRate).first
    val filterbank = computeMelFilterbank(
        numFilters = 128,
        powerSpectrumSize = powerSpectrum.size,
        sampleRate = sampleRate
    )

    return filterbank.map { weights ->
        // Left-to-right Float accumulation over every spectrum bin.
        val bandEnergy = powerSpectrum.indices.fold(0f) { acc, bin ->
            acc + powerSpectrum[bin] * weights[bin]
        }
        kotlin.math.ln(maxOf(bandEnergy, 1e-10f))
    }
}
1645
+
1646
/**
 * Folds the magnitude spectrum of [samples] into pitch classes.
 *
 * `prepareFFT` returns one magnitude per frequency bin (`fftLength / 2 + 1`
 * values), so each bin is used directly and its centre frequency is derived
 * from the actual FFT length rather than a fixed constant. The DC and Nyquist
 * bins are skipped. Each remaining bin is assigned to the pitch class
 * `floor(12 * log2(freq / 440)) mod 12`, where the modulo is always
 * non-negative so frequencies below the 440 Hz reference are binned too.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate in Hz.
 * @return Accumulated magnitude per chroma bin (`N_CHROMA` values).
 */
private fun computeChroma(samples: FloatArray, sampleRate: Float): List<Float> {
    val (_, magnitudeSpectrum) = prepareFFT(samples, sampleRate)
    val chroma = FloatArray(N_CHROMA) { 0f }

    // Recover the FFT length from the spectrum size (fftLength / 2 + 1 bins).
    val fftLength = (magnitudeSpectrum.size - 1) * 2
    if (fftLength <= 0) return chroma.toList()
    val freqsPerBin = sampleRate / fftLength

    for (i in 1 until magnitudeSpectrum.size - 1) {
        val freq = i * freqsPerBin
        if (freq <= 0f) continue

        // Floor + non-negative modulo keeps sub-440 Hz bins instead of dropping them.
        val semitone = kotlin.math.floor(12 * log2(freq / 440.0)).toInt()
        val pitchClass = ((semitone % 12) + 12) % 12
        chroma[pitchClass] += magnitudeSpectrum[i]
    }

    return chroma.toList()
}
1666
+
1667
/**
 * Placeholder for spectral contrast; not implemented yet.
 *
 * Always returns an empty list. No FFT is run, because its result would be
 * thrown away until a real implementation exists.
 *
 * @param samples Input audio samples (currently unused).
 * @param sampleRate Sampling rate in Hz (currently unused).
 * @return An empty list.
 */
private fun computeSpectralContrast(samples: FloatArray, sampleRate: Float): List<Float> {
    // TODO: implement spectral contrast on top of the magnitude spectrum from prepareFFT.
    return emptyList() // Placeholder
}
1673
+
1674
/**
 * Projects the chroma vector of [samples] onto six fixed weight rows.
 *
 * Each output value is the dot product of one matrix row with the chroma
 * vector from `computeChroma`; chroma entries missing for a column count as 0.
 * Every row has exactly 12 columns, one per pitch class.
 *
 * NOTE(review): the weights are a binary selection matrix and the row labels
 * are carried over as written; confirm they match the intended tonnetz
 * definition.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate in Hz.
 * @return Six projected values, one per matrix row.
 */
private fun computeTonnetz(samples: FloatArray, sampleRate: Float): List<Float> {
    // First compute chroma features
    val chroma = computeChroma(samples, sampleRate)

    // Transformation matrix (6x12). Columns beyond index 11 were removed: they
    // never matched a chroma entry and therefore never affected the result.
    val tonnetzMatrix = arrayOf(
        floatArrayOf(1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f, 0f, 0f), // Perfect fifth
        floatArrayOf(0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f, 0f), // Minor third
        floatArrayOf(0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f), // Major third
        floatArrayOf(0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f), // Perfect fifth
        floatArrayOf(0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 0f), // Minor third
        floatArrayOf(1f, 0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f)  // Major third
    )

    return tonnetzMatrix.map { row ->
        var sum = 0f
        for (i in row.indices) {
            sum += row[i] * (chroma.getOrNull(i) ?: 0f)
        }
        sum
    }
}
1700
+
1701
/**
 * Returns the smallest power of two that is greater than or equal to [n].
 *
 * Values of [n] that are zero or negative yield 1.
 *
 * @param n Lower bound for the result; must not exceed 2^30.
 * @return The smallest power of two `>= n`.
 * @throws IllegalArgumentException if [n] is larger than 2^30, because the
 * next power of two does not fit in an `Int` (doubling would overflow and
 * never terminate).
 */
private fun nextPowerOfTwo(n: Int): Int {
    require(n <= (1 shl 30)) { "No power of two >= $n fits in an Int" }

    var value = 1
    while (value < n) {
        value = value shl 1
    }
    return value
}
1708
+
1709
/**
 * Estimates the pitch of [segment] from its autocorrelation.
 *
 * Steps: Hann-window the segment, zero-pad it to a power-of-two length of at
 * least twice the segment size, take the forward FFT, build the power
 * spectrum, inverse-transform it to obtain the autocorrelation, normalise by
 * the zero-lag value, then pick the strongest local peak above 0.3 whose lag
 * corresponds to 50-500 Hz.
 *
 * NOTE(review): the layout of `FFT.realForward` output (index 0 = DC,
 * index 1 = Nyquist, then re/im pairs) and the contract of `FFT.realInverse`
 * are taken from how this function uses them - confirm against the FFT class.
 *
 * @param segment Audio samples to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return The estimated pitch in Hz, or 0 when the segment has fewer than two
 * samples, an FFT step fails, the signal has no usable energy, or no peak qualifies.
 */
private fun estimatePitch(segment: FloatArray, sampleRate: Float): Float {
    if (segment.size < 2) return 0.0f

    // Apply Hann window
    val windowed = applyHannWindow(segment)

    // Pad for FFT - ensure length is power of 2 and sufficient for autocorrelation
    val fftLength = nextPowerOfTwo(segment.size * 2)
    val padded = FloatArray(fftLength) // Initialize with zeros
    windowed.copyInto(padded) // Copy windowed data into padded array

    // Perform forward FFT
    val fft = FFT(fftLength)
    try {
        fft.realForward(padded)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "FFT forward transform failed: ${e.message}")
        return 0.0f
    }

    // Compute power spectrum
    val powerSpectrum = FloatArray(fftLength)
    try {
        // Handle DC and Nyquist components separately
        powerSpectrum[0] = padded[0] * padded[0]
        powerSpectrum[fftLength / 2] = padded[1] * padded[1]

        // Handle remaining frequencies
        for (i in 1 until fftLength / 2) {
            val re = padded[2 * i]
            val im = padded[2 * i + 1]
            powerSpectrum[i] = re * re + im * im
            powerSpectrum[fftLength - i] = powerSpectrum[i] // Mirror for inverse FFT
        }
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Power spectrum computation failed: ${e.message}")
        return 0.0f
    }

    // Inverse FFT to get autocorrelation
    val autocorrelation = FloatArray(fftLength)
    try {
        fft.realInverse(powerSpectrum, autocorrelation)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "FFT inverse transform failed: ${e.message}")
        return 0.0f
    }

    // A silent or numerically degenerate segment has no positive, finite zero-lag
    // energy; dividing by it would fill the array with NaN/Infinity.
    val zeroLag = autocorrelation[0]
    if (!(zeroLag > 0f) || zeroLag.isInfinite()) return 0.0f

    // Normalize autocorrelation by the zero-lag value
    val normFactor = 1.0f / zeroLag
    for (i in autocorrelation.indices) {
        autocorrelation[i] *= normFactor
    }

    // Lag range corresponding to 50-500 Hz
    val minLag = (sampleRate / 500.0f).toInt().coerceAtLeast(1)
    val maxLag = (sampleRate / 50.0f).toInt().coerceAtMost(autocorrelation.size - 1)
    // A peak test needs a neighbour on both sides, so stop one short of the last index.
    val lastLag = minOf(maxLag, autocorrelation.size - 2)

    val threshold = 0.3f // Correlation threshold
    var maxCorr = -1.0f
    var pitchLag = 0

    for (lag in minLag..lastLag) {
        val isPeak = autocorrelation[lag] > autocorrelation[lag - 1] &&
            autocorrelation[lag] > autocorrelation[lag + 1] &&
            autocorrelation[lag] > threshold

        if (isPeak && autocorrelation[lag] > maxCorr) {
            maxCorr = autocorrelation[lag]
            pitchLag = lag
        }
    }

    return if (pitchLag > 0) sampleRate / pitchLag else 0.0f
}
1790
+
1791
/**
 * Applies a Hann window, pads or truncates to [fftLength], runs the forward
 * FFT and derives both the magnitude and power spectra.
 *
 * The transform output is read as: index 0 = DC term, index 1 = Nyquist term,
 * and `(2 * i, 2 * i + 1)` = real/imaginary parts of bin `i` for the bins in
 * between. The DC bin therefore uses only index 0; mixing index 1 into it
 * would leak Nyquist energy into bin 0.
 * NOTE(review): that layout is the one this file relies on elsewhere -
 * confirm against the FFT class.
 *
 * @param samples Input audio samples.
 * @param sampleRate Sampling rate in Hz (not used by this function).
 * @param fftLength FFT size (must be power of 2); defaults to the next power
 * of two at or above `max(samples.size, 2048)`.
 * @return Pair of power spectrum and magnitude spectrum, each with
 * `fftLength / 2 + 1` entries.
 */
private fun prepareFFT(samples: FloatArray, sampleRate: Float, fftLength: Int = nextPowerOfTwo(samples.size.coerceAtLeast(2048))): Pair<FloatArray, FloatArray> {
    val windowed = applyHannWindow(samples)
    val padded = windowed.copyOf(fftLength)
    val fft = FFT(fftLength)
    fft.realForward(padded)

    val half = fftLength / 2
    val magnitudeSpectrum = FloatArray(half + 1)

    // DC and Nyquist are purely real and stored in the first two slots.
    magnitudeSpectrum[0] = abs(padded[0])
    for (i in 1 until half) {
        val re = padded[2 * i]
        val im = padded[2 * i + 1]
        magnitudeSpectrum[i] = sqrt(re * re + im * im)
    }
    magnitudeSpectrum[half] = abs(padded[1])

    val powerSpectrum = magnitudeSpectrum.map { it * it }.toFloatArray()
    return Pair(powerSpectrum, magnitudeSpectrum)
}
1815
+
1816
/**
 * Basic description of an audio stream's format.
 *
 * @property sampleRate Sample rate of the stream.
 * @property channels Number of channels.
 * @property bitDepth Bits per sample.
 */
data class AudioFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitDepth: Int,
)
1821
+
1822
/**
 * Reads the sample rate and channel count of an audio file.
 *
 * The `file://` prefix is stripped from [fileUri]; the path is tried as given
 * and then by file name inside `filesDir`. The format is taken from track 0
 * of a `MediaExtractor`, which is always released before returning.
 *
 * NOTE(review): the bit depth is always reported as 16 regardless of the
 * file's actual sample format - confirm callers that compute byte offsets
 * from it only see 16-bit data.
 *
 * @param fileUri Path or `file://` URI of the audio file.
 * @return The detected format, or null when the file cannot be found or the
 * extractor fails.
 */
fun getAudioFormat(fileUri: String): AudioFormat? {
    val cleanUri = fileUri.removePrefix("file://")

    val direct = File(cleanUri)
    val file = if (direct.exists()) {
        direct
    } else {
        File(filesDir, File(cleanUri).name).takeIf { it.exists() }
    }
    if (file == null) {
        Log.e(Constants.TAG, "File not found: $cleanUri")
        return null
    }

    val extractor = MediaExtractor()
    return try {
        extractor.setDataSource(file.absolutePath)
        val trackFormat = extractor.getTrackFormat(0)
        AudioFormat(
            sampleRate = trackFormat.getInteger(MediaFormat.KEY_SAMPLE_RATE),
            channels = trackFormat.getInteger(MediaFormat.KEY_CHANNEL_COUNT),
            bitDepth = 16 // Most compressed formats decode to 16-bit PCM
        )
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to get audio format: ${e.message}")
        null
    } finally {
        extractor.release()
    }
}
1846
+
1847
/**
 * Gets the size of the audio file header.
 * For WAV files, this includes the RIFF header and all metadata chunks before the data chunk.
 * For other formats, this will return null as header size handling is format-specific.
 *
 * @param fileUri The URI of the audio file to analyze
 * @return The size of the header in bytes (i.e. the offset of the first audio byte), or null if:
 *         - The file is not a WAV file
 *         - The file cannot be read
 *         - The file format is invalid
 *         - The data chunk cannot be found
 *
 * WAV File Structure:
 * - RIFF header (12 bytes)
 *   - "RIFF" identifier (4 bytes)
 *   - File size (4 bytes)
 *   - "WAVE" identifier (4 bytes)
 * - Format chunk ("fmt ") (24 bytes typically)
 * - Optional metadata chunks (variable size)
 *   - LIST (metadata like artist, title)
 *   - JUNK (padding)
 *   - fact (additional format info)
 *   - cue (cue points)
 * - Data chunk
 *   - "data" identifier (4 bytes)
 *   - Chunk size (4 bytes)
 *   - Actual audio data
 *
 * Every chunk is an 8-byte header (ID + little-endian size) followed by its payload; a payload
 * with an odd size is followed by one pad byte that is not counted in the chunk size.
 */
fun getWavHeaderSize(fileUri: String): Int? {
    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            Log.e(Constants.TAG, "File not found: $cleanUri")
            return null
        }

    return try {
        // `use` closes the stream on every exit path, including failures.
        FileInputStream(file).use { stream -> findWavDataOffset(stream) }
            ?.also { headerSize -> Log.d(Constants.TAG, "Total WAV header size: $headerSize bytes") }
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Error calculating WAV header size: ${e.message}", e)
        null
    }
}

/**
 * Walks the RIFF chunk list of [stream] and returns the byte offset of the audio payload
 * of the "data" chunk, or null if the stream is not a valid WAV file or has no data chunk.
 */
private fun findWavDataOffset(stream: FileInputStream): Int? {
    val buffer = ByteArray(12)

    // read() may return fewer bytes than requested, so keep reading until filled or EOF.
    fun fill(length: Int): Boolean {
        var filled = 0
        while (filled < length) {
            val count = stream.read(buffer, filled, length - filled)
            if (count < 0) return false
            filled += count
        }
        return true
    }

    if (!fill(12)) {
        Log.e(Constants.TAG, "Failed to read RIFF header")
        return null
    }

    val riffId = String(buffer, 0, 4, Charsets.US_ASCII)
    val waveId = String(buffer, 8, 4, Charsets.US_ASCII)
    if (riffId != "RIFF" || waveId != "WAVE") {
        Log.e(Constants.TAG, "Invalid WAV file format")
        return null
    }

    // Tracked as Long so a malformed file cannot overflow the running offset.
    var offset = 12L

    while (true) {
        if (!fill(8)) {
            // Ran out of chunks without seeing "data": there is no valid header size to report.
            Log.e(Constants.TAG, "Unexpected end of file while reading chunks")
            return null
        }

        val chunkId = String(buffer, 0, 4, Charsets.US_ASCII)
        // Chunk size is a little-endian 32-bit value in bytes 4..7.
        val chunkSize = ((buffer[7].toInt() and 0xFF) shl 24) or
            ((buffer[6].toInt() and 0xFF) shl 16) or
            ((buffer[5].toInt() and 0xFF) shl 8) or
            (buffer[4].toInt() and 0xFF)
        Log.d(Constants.TAG, "Found chunk: $chunkId, size: $chunkSize")

        offset += 8 // chunk header
        if (chunkId == "data") {
            if (offset > Int.MAX_VALUE) {
                Log.e(Constants.TAG, "Invalid WAV file format")
                return null
            }
            Log.d(Constants.TAG, "Found data chunk at offset: $offset")
            return offset.toInt()
        }

        if (chunkSize < 0) {
            // Sizes above 2 GiB read as negative; they cannot be valid for a pre-data chunk here.
            Log.e(Constants.TAG, "Invalid WAV file format")
            return null
        }

        // Odd-sized payloads are followed by one pad byte.
        var remaining = chunkSize.toLong() + (chunkSize and 1)
        offset += remaining

        // skip() may skip fewer bytes than requested, so loop until the payload is consumed.
        while (remaining > 0) {
            val skipped = stream.skip(remaining)
            if (skipped <= 0) {
                Log.e(Constants.TAG, "Unexpected end of file while reading chunks")
                return null
            }
            remaining -= skipped
        }
    }
}
1486
1936
  }