@siteed/expo-audio-stream 1.17.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/CHANGELOG.md +21 -1
  2. package/README.md +1 -1
  3. package/android/src/main/java/net/siteed/audiostream/AudioAnalysisData.kt +68 -22
  4. package/android/src/main/java/net/siteed/audiostream/AudioFormatUtils.kt +24 -0
  5. package/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt +836 -386
  6. package/android/src/main/java/net/siteed/audiostream/AudioRecorderManager.kt +0 -2
  7. package/android/src/main/java/net/siteed/audiostream/AudioRecordingService.kt +35 -29
  8. package/android/src/main/java/net/siteed/audiostream/ExpoAudioStreamModule.kt +236 -96
  9. package/android/src/main/java/net/siteed/audiostream/FFT.kt +55 -0
  10. package/android/src/main/java/net/siteed/audiostream/Features.kt +49 -7
  11. package/android/src/main/java/net/siteed/audiostream/RecordingConfig.kt +2 -4
  12. package/build/AudioAnalysis/AudioAnalysis.types.d.ts +55 -47
  13. package/build/AudioAnalysis/AudioAnalysis.types.d.ts.map +1 -1
  14. package/build/AudioAnalysis/AudioAnalysis.types.js.map +1 -1
  15. package/build/AudioAnalysis/extractAudioAnalysis.d.ts +60 -13
  16. package/build/AudioAnalysis/extractAudioAnalysis.d.ts.map +1 -1
  17. package/build/AudioAnalysis/extractAudioAnalysis.js +147 -162
  18. package/build/AudioAnalysis/extractAudioAnalysis.js.map +1 -1
  19. package/build/ExpoAudioStream.types.d.ts +47 -3
  20. package/build/ExpoAudioStream.types.d.ts.map +1 -1
  21. package/build/ExpoAudioStream.types.js.map +1 -1
  22. package/build/ExpoAudioStream.web.d.ts.map +1 -1
  23. package/build/ExpoAudioStream.web.js +0 -1
  24. package/build/ExpoAudioStream.web.js.map +1 -1
  25. package/build/ExpoAudioStreamModule.d.ts.map +1 -1
  26. package/build/ExpoAudioStreamModule.js +216 -12
  27. package/build/ExpoAudioStreamModule.js.map +1 -1
  28. package/build/WebRecorder.web.d.ts +67 -13
  29. package/build/WebRecorder.web.d.ts.map +1 -1
  30. package/build/WebRecorder.web.js +177 -173
  31. package/build/WebRecorder.web.js.map +1 -1
  32. package/build/index.d.ts +3 -3
  33. package/build/index.d.ts.map +1 -1
  34. package/build/index.js +2 -2
  35. package/build/index.js.map +1 -1
  36. package/build/useAudioRecorder.d.ts.map +1 -1
  37. package/build/useAudioRecorder.js +12 -8
  38. package/build/useAudioRecorder.js.map +1 -1
  39. package/build/utils/audioProcessing.d.ts +24 -0
  40. package/build/utils/audioProcessing.d.ts.map +1 -0
  41. package/build/utils/audioProcessing.js +133 -0
  42. package/build/utils/audioProcessing.js.map +1 -0
  43. package/build/workers/InlineFeaturesExtractor.web.d.ts +1 -1
  44. package/build/workers/InlineFeaturesExtractor.web.d.ts.map +1 -1
  45. package/build/workers/InlineFeaturesExtractor.web.js +694 -194
  46. package/build/workers/InlineFeaturesExtractor.web.js.map +1 -1
  47. package/build/workers/inlineAudioWebWorker.web.d.ts +1 -1
  48. package/build/workers/inlineAudioWebWorker.web.d.ts.map +1 -1
  49. package/build/workers/inlineAudioWebWorker.web.js +3 -2
  50. package/build/workers/inlineAudioWebWorker.web.js.map +1 -1
  51. package/ios/AudioAnalysisData.swift +51 -16
  52. package/ios/AudioProcessingHelpers.swift +710 -26
  53. package/ios/AudioProcessor.swift +334 -185
  54. package/ios/AudioStreamManager.swift +2 -3
  55. package/ios/DataPoint.swift +25 -12
  56. package/ios/DecodingConfig.swift +47 -0
  57. package/ios/ExpoAudioStreamModule.swift +187 -103
  58. package/ios/FFT.swift +62 -0
  59. package/ios/Features.swift +24 -3
  60. package/ios/RecordingSettings.swift +7 -7
  61. package/package.json +2 -1
  62. package/src/AudioAnalysis/AudioAnalysis.types.ts +68 -52
  63. package/src/AudioAnalysis/extractAudioAnalysis.ts +223 -219
  64. package/src/ExpoAudioStream.types.ts +53 -7
  65. package/src/ExpoAudioStream.web.ts +0 -1
  66. package/src/ExpoAudioStreamModule.ts +255 -10
  67. package/src/WebRecorder.web.ts +231 -244
  68. package/src/index.ts +5 -3
  69. package/src/useAudioRecorder.tsx +14 -10
  70. package/src/utils/audioProcessing.ts +205 -0
  71. package/src/workers/InlineFeaturesExtractor.web.tsx +694 -194
  72. package/src/workers/inlineAudioWebWorker.web.tsx +3 -2
@@ -1,4 +1,4 @@
1
- // AudioProcessor.swift
1
+ // packages/expo-audio-stream/ios/AudioProcessor.swift
2
2
 
3
3
  import Foundation
4
4
  import Accelerate
@@ -67,14 +67,24 @@ public class AudioProcessor {
67
67
  /// - numberOfSamples: The number of samples to extract (for waveform).
68
68
  /// - offset: The offset to start reading from (in samples).
69
69
  /// - length: The length of the audio to read (in samples).
70
- /// - pointsPerSecond: The number of data points to extract per second (for features).
71
- /// - algorithm: The algorithm to use for feature extraction.
70
+ /// - segmentDurationMs: The duration of each segment in milliseconds.
72
71
  /// - featureOptions: The features to extract.
73
72
  /// - bitDepth: The bit depth of the audio data.
74
73
  /// - numberOfChannels: The number of channels in the audio data.
74
+ /// - position: The position to start reading from (in bytes).
75
+ /// - byteLength: The length of the audio to read (in bytes).
75
76
  /// - Returns: An `AudioAnalysisData` object containing the extracted features.
76
- public func processAudioData(numberOfSamples: Int?, offset: Int? = 0, length: UInt? = nil, pointsPerSecond: Int?, algorithm: String, featureOptions: [String: Bool], bitDepth: Int, numberOfChannels: Int) -> AudioAnalysisData? {
77
-
77
+ public func processAudioData(
78
+ numberOfSamples: Int?,
79
+ offset: Int? = 0,
80
+ length: UInt? = nil,
81
+ segmentDurationMs: Int = 100, // Default 100ms
82
+ featureOptions: [String: Bool],
83
+ bitDepth: Int,
84
+ numberOfChannels: Int,
85
+ position: Int? = nil,
86
+ byteLength: Int? = nil
87
+ ) -> AudioAnalysisData? {
78
88
  guard let audioFile = audioFile else {
79
89
  reject("FILE_NOT_INITIALIZED", "Audio file is not initialized.")
80
90
  return nil
@@ -84,16 +94,69 @@ public class AudioProcessor {
84
94
  var framesPerBuffer: AVAudioFrameCount
85
95
  let actualPointsPerSecond: Int
86
96
 
97
+ NSLog("""
98
+ [AudioProcessor] Starting audio processing:
99
+ - totalFrameCount: \(totalFrameCount)
100
+ - bitDepth: \(bitDepth)
101
+ - numberOfChannels: \(numberOfChannels)
102
+ - position: \(position ?? -1)
103
+ - byteLength: \(byteLength ?? -1)
104
+ - offset: \(offset ?? -1)
105
+ - length: \(length ?? 0)
106
+ """)
107
+
108
+ // Use position/byteLength if provided, otherwise fall back to offset/length
109
+ let effectiveOffset: Int64 = if let position = position {
110
+ Int64(position / (bitDepth / 8) / numberOfChannels)
111
+ } else {
112
+ Int64(offset ?? 0)
113
+ }
114
+
115
+ let effectiveLength: Int64 = if let byteLength = byteLength {
116
+ Int64(byteLength / (bitDepth / 8) / numberOfChannels)
117
+ } else if let length = length {
118
+ Int64(length)
119
+ } else {
120
+ Int64(totalFrameCount) - effectiveOffset
121
+ }
122
+
123
+ NSLog("""
124
+ [AudioProcessor] Calculated frame positions:
125
+ - effectiveOffset: \(effectiveOffset)
126
+ - effectiveLength: \(effectiveLength)
127
+ - expectedEndFrame: \(effectiveOffset + effectiveLength)
128
+ - totalFrameCount: \(totalFrameCount)
129
+ """)
130
+
131
+ // Validate frame boundaries
132
+ if effectiveOffset < 0 || effectiveOffset >= Int64(totalFrameCount) {
133
+ NSLog("[AudioProcessor] ERROR: Invalid offset value")
134
+ reject("INVALID_OFFSET", "Offset value (\(effectiveOffset)) is outside valid range [0, \(totalFrameCount)]")
135
+ return nil
136
+ }
137
+
138
+ if effectiveLength <= 0 {
139
+ NSLog("[AudioProcessor] ERROR: Invalid length value")
140
+ reject("INVALID_LENGTH", "Length value (\(effectiveLength)) must be positive")
141
+ return nil
142
+ }
143
+
144
+ if effectiveOffset + effectiveLength > Int64(totalFrameCount) {
145
+ NSLog("[AudioProcessor] ERROR: Requested range exceeds file length")
146
+ reject("INVALID_RANGE", "Requested range [\(effectiveOffset), \(effectiveOffset + effectiveLength)] exceeds file length \(totalFrameCount)")
147
+ return nil
148
+ }
149
+
150
+ var startFrame: AVAudioFramePosition = effectiveOffset
151
+ let endFrame: AVAudioFramePosition = effectiveOffset + effectiveLength
152
+
153
+ // Calculate frames per segment based on segment duration
154
+ let framesPerSegment = AVAudioFrameCount(Float(audioFile.fileFormat.sampleRate) * Float(segmentDurationMs) / 1000.0)
155
+
87
156
  if let numberOfSamples = numberOfSamples {
88
- framesPerBuffer = totalFrameCount / AVAudioFrameCount(numberOfSamples)
89
- actualPointsPerSecond = Int(Double(totalFrameCount) / audioFile.fileFormat.sampleRate)
90
- } else if let pointsPerSecond = pointsPerSecond {
91
- actualPointsPerSecond = pointsPerSecond
92
- framesPerBuffer = totalFrameCount / AVAudioFrameCount(actualPointsPerSecond)
157
+ framesPerBuffer = AVAudioFrameCount(max(1, effectiveLength / Int64(numberOfSamples)))
93
158
  } else {
94
- // Default behavior: set pointsPerSecond to 1000
95
- actualPointsPerSecond = 1000
96
- framesPerBuffer = totalFrameCount / AVAudioFrameCount(actualPointsPerSecond)
159
+ framesPerBuffer = framesPerSegment
97
160
  }
98
161
 
99
162
  guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: framesPerBuffer) else {
@@ -104,11 +167,15 @@ public class AudioProcessor {
104
167
  channelCount = Int(audioFile.processingFormat.channelCount)
105
168
  var data = Array(repeating: [Float](repeating: 0, count: Int(framesPerBuffer)), count: channelCount)
106
169
 
107
- var startFrame: AVAudioFramePosition = offset == nil ? audioFile.framePosition : Int64(offset! * Int(framesPerBuffer))
108
- var endFrame: AVAudioFramePosition = length == nil ? audioFile.length : min(audioFile.length, startFrame + Int64(length!))
109
-
110
170
  var channelData = [Float]()
111
171
  while startFrame < endFrame {
172
+ let remainingFrames = endFrame - startFrame
173
+ let currentFramesPerBuffer = min(AVAudioFrameCount(framesPerBuffer), AVAudioFrameCount(remainingFrames))
174
+
175
+ if currentFramesPerBuffer <= 0 {
176
+ break
177
+ }
178
+
112
179
  if abortExtraction {
113
180
  audioFile.framePosition = startFrame
114
181
  abortExtraction = false
@@ -117,7 +184,7 @@ public class AudioProcessor {
117
184
 
118
185
  do {
119
186
  audioFile.framePosition = startFrame
120
- try audioFile.read(into: buffer, frameCount: framesPerBuffer)
187
+ try audioFile.read(into: buffer, frameCount: currentFramesPerBuffer)
121
188
  } catch {
122
189
  reject("AUDIO_READ_ERROR", "Couldn't read into buffer: \(error.localizedDescription)")
123
190
  return nil
@@ -132,26 +199,42 @@ public class AudioProcessor {
132
199
  channelData.append(floatData[0][frame])
133
200
  }
134
201
 
135
- startFrame += AVAudioFramePosition(framesPerBuffer)
136
- if startFrame + AVAudioFramePosition(framesPerBuffer) > endFrame {
137
- framesPerBuffer = AVAudioFrameCount(endFrame - startFrame)
138
- }
202
+ startFrame += AVAudioFramePosition(currentFramesPerBuffer)
139
203
  }
140
204
 
141
- return processChannelData(channelData: channelData, sampleRate: Float(audioFile.fileFormat.sampleRate), pointsPerSecond: actualPointsPerSecond, algorithm: algorithm, featureOptions: featureOptions, bitDepth: bitDepth, numberOfChannels: numberOfChannels)
205
+ NSLog("""
206
+ [AudioProcessor] Audio processing completed:
207
+ - processedFrames: \(endFrame - startFrame)
208
+ - framesPerBuffer: \(framesPerBuffer)
209
+ """)
210
+
211
+ return processChannelData(
212
+ channelData: channelData,
213
+ sampleRate: Float(audioFile.fileFormat.sampleRate),
214
+ segmentDurationMs: segmentDurationMs,
215
+ featureOptions: featureOptions,
216
+ bitDepth: bitDepth,
217
+ numberOfChannels: numberOfChannels
218
+ )
142
219
  }
143
220
 
144
221
  /// Processes audio data from a buffer.
145
222
  /// - Parameters:
146
223
  /// - data: The audio data buffer.
147
224
  /// - sampleRate: The sample rate of the audio data.
148
- /// - pointsPerSecond: The number of data points to extract per second (for features).
149
- /// - algorithm: The algorithm to use for feature extraction.
225
+ /// - segmentDurationMs: The duration of each segment in milliseconds.
150
226
  /// - featureOptions: The features to extract.
151
227
  /// - bitDepth: The bit depth of the audio data.
152
228
  /// - numberOfChannels: The number of channels in the audio data.
153
229
  /// - Returns: An `AudioAnalysisData` object containing the extracted features.
154
- public func processAudioBuffer(data: Data, sampleRate: Float, pointsPerSecond: Int, algorithm: String, featureOptions: [String: Bool], bitDepth: Int, numberOfChannels: Int) -> AudioAnalysisData? {
230
+ public func processAudioBuffer(
231
+ data: Data,
232
+ sampleRate: Float,
233
+ segmentDurationMs: Int,
234
+ featureOptions: [String: Bool],
235
+ bitDepth: Int,
236
+ numberOfChannels: Int
237
+ ) -> AudioAnalysisData? {
155
238
  guard !data.isEmpty else {
156
239
  Logger.debug("Data is empty, rejecting")
157
240
  reject("DATA_EMPTY", "The audio data is empty.")
@@ -177,121 +260,154 @@ public class AudioProcessor {
177
260
  return nil
178
261
  }
179
262
 
180
- return processChannelData(channelData: floatData, sampleRate: sampleRate, pointsPerSecond: pointsPerSecond, algorithm: algorithm, featureOptions: featureOptions, bitDepth: bitDepth, numberOfChannels: numberOfChannels)
263
+ return processChannelData(
264
+ channelData: floatData,
265
+ sampleRate: sampleRate,
266
+ segmentDurationMs: segmentDurationMs,
267
+ featureOptions: featureOptions,
268
+ bitDepth: bitDepth,
269
+ numberOfChannels: numberOfChannels
270
+ )
181
271
  }
182
272
 
183
273
  /// Processes the given audio channel data to extract features.
184
274
  /// - Parameters:
185
275
  /// - channelData: The audio channel data to process.
186
276
  /// - sampleRate: The sample rate of the audio data.
187
- /// - pointsPerSecond: The number of data points to extract per second (for features).
188
- /// - algorithm: The algorithm to use for feature extraction.
277
+ /// - segmentDurationMs: The duration of each segment in milliseconds.
189
278
  /// - featureOptions: The features to extract.
190
279
  /// - bitDepth: The bit depth of the audio data.
191
280
  /// - numberOfChannels: The number of channels in the audio data.
192
281
  /// - Returns: An `AudioAnalysisData` object containing the extracted features.
193
- private func processChannelData(channelData: [Float], sampleRate: Float, pointsPerSecond: Int, algorithm: String, featureOptions: [String: Bool], bitDepth: Int, numberOfChannels: Int) -> AudioAnalysisData? {
194
- Logger.debug("Processing audio data with sample rate: \(sampleRate), points per second: \(pointsPerSecond), algorithm: \(algorithm), bitDepth: \(bitDepth), numberOfChannels: \(numberOfChannels)")
282
+ private func processChannelData(
283
+ channelData: [Float],
284
+ sampleRate: Float,
285
+ segmentDurationMs: Int,
286
+ featureOptions: [String: Bool],
287
+ bitDepth: Int,
288
+ numberOfChannels: Int
289
+ ) -> AudioAnalysisData? {
290
+ Logger.debug("Processing audio data with sample rate: \(sampleRate), segmentDurationMs: \(segmentDurationMs), bitDepth: \(bitDepth), numberOfChannels: \(numberOfChannels)")
195
291
 
196
- let startTime = CACurrentMediaTime() // Start the timer with high precision
292
+ let startTime = CACurrentMediaTime()
197
293
 
198
294
  let length = channelData.count
199
- let pointInterval = Int(sampleRate) / pointsPerSecond
295
+ // Calculate points per segment based on segment duration
296
+ let samplesPerSegment = Int(Float(segmentDurationMs) * sampleRate / 1000.0)
200
297
  var dataPoints = [DataPoint]()
201
298
  var minAmplitude: Float = .greatestFiniteMagnitude
202
299
  var maxAmplitude: Float = -.greatestFiniteMagnitude
203
- let durationMs = Float(length) / sampleRate * 1000
204
-
205
- var sumSquares: Float = 0
206
- var zeroCrossings = 0
207
- var prevValue: Float = 0
208
- var localMinAmplitude: Float = .greatestFiniteMagnitude
209
- var localMaxAmplitude: Float = -.greatestFiniteMagnitude
210
- var segmentData = [Float]()
211
- var currentPosition = 0 // Track the current byte position
212
-
213
- for i in 0..<length {
214
- updateSegmentData(channelData: channelData, index: i, sumSquares: &sumSquares, zeroCrossings: &zeroCrossings, prevValue: &prevValue, localMinAmplitude: &localMinAmplitude, localMaxAmplitude: &localMaxAmplitude, segmentData: &segmentData)
300
+
301
+ // Calculate bytes per sample
302
+ let bytesPerSample = bitDepth / 8
303
+
304
+ // Process data in segments
305
+ var i = 0
306
+ while i < length {
307
+ let segmentEnd = min(i + samplesPerSegment, length)
308
+ let segment = Array(channelData[i..<segmentEnd])
215
309
 
216
- if (i + 1) % pointInterval == 0 || i == length - 1 {
217
- var features = computeFeatures(segmentData: segmentData, sampleRate: sampleRate, sumSquares: sumSquares, zeroCrossings: zeroCrossings, segmentLength: (i % pointInterval) + 1, featureOptions: featureOptions)
218
- features.minAmplitude = localMinAmplitude
219
- features.maxAmplitude = localMaxAmplitude
220
- let rms = features.rms
221
- let silent = rms < 0.01
222
- let dB = featureOptions["dB"] == true ? 20 * log10(rms) : 0
223
- minAmplitude = min(minAmplitude, localMinAmplitude)
224
- maxAmplitude = max(maxAmplitude, localMaxAmplitude)
225
-
226
- let segmentSize = segmentData.count
227
- let segmentDuration = Float(segmentSize) / sampleRate
228
-
229
- // Calculate start time and end time
230
- let segmentStartTime = Float(i - segmentSize + 1) / sampleRate
231
- let segmentEndTime = Float(i + 1) / sampleRate
232
-
233
- // Calculate start position and end position in bytes
234
- let bytesPerSample = bitDepth / 8
235
- let startPosition = currentPosition
236
- let endPosition = startPosition + (segmentSize * bytesPerSample * numberOfChannels)
237
-
238
- dataPoints.append(DataPoint(
239
- id: uniqueIdCounter, // Assign unique ID
240
- amplitude: algorithm == "peak" ? localMaxAmplitude : rms,
241
- activeSpeech: nil,
242
- dB: dB,
243
- silent: silent,
244
- features: features,
245
- startTime: segmentStartTime,
246
- endTime: segmentEndTime,
247
- startPosition: startPosition,
248
- endPosition: endPosition,
249
- speaker: 0
250
- ))
251
- uniqueIdCounter += 1 // Increment the unique ID counter
252
-
253
- resetSegmentData(&sumSquares, &zeroCrossings, &localMinAmplitude, &localMaxAmplitude, &segmentData)
254
-
255
- // Update the current byte position
256
- currentPosition = endPosition
257
- }
310
+ // Calculate byte positions and timing
311
+ let startPosition = i * bytesPerSample * numberOfChannels
312
+ let endPosition = segmentEnd * bytesPerSample * numberOfChannels
313
+ let startTime = Float(i) / sampleRate
314
+ let endTime = Float(segmentEnd) / sampleRate
315
+
316
+ // Process segment and create data point
317
+ let dataPoint = processSegment(
318
+ segment,
319
+ sampleRate: sampleRate,
320
+ featureOptions: featureOptions,
321
+ startTime: startTime,
322
+ endTime: endTime,
323
+ startPosition: startPosition,
324
+ endPosition: endPosition
325
+ )
326
+ dataPoints.append(dataPoint)
327
+
328
+ // Update min/max amplitudes
329
+ minAmplitude = min(minAmplitude, segment.min() ?? minAmplitude)
330
+ maxAmplitude = max(maxAmplitude, segment.max() ?? maxAmplitude)
331
+
332
+ i += samplesPerSegment
258
333
  }
259
334
 
260
- let endTime = CACurrentMediaTime() // End the timer with high precision
335
+ let endTime = CACurrentMediaTime()
261
336
  let processingTimeMs = Float((endTime - startTime) * 1000)
262
337
 
263
338
  Logger.debug("Processed \(dataPoints.count) data points in \(processingTimeMs) ms")
264
339
 
265
340
  return AudioAnalysisData(
266
- pointsPerSecond: pointsPerSecond,
267
- durationMs: Float(durationMs),
341
+ segmentDurationMs: segmentDurationMs,
342
+ durationMs: Int(Float(length) / sampleRate * 1000),
268
343
  bitDepth: bitDepth,
269
344
  numberOfChannels: numberOfChannels,
270
- sampleRate: sampleRate,
271
- samples: channelData.count,
345
+ sampleRate: Int(sampleRate),
346
+ samples: length,
272
347
  dataPoints: dataPoints,
273
- amplitudeRange: (min: minAmplitude, max: maxAmplitude),
274
- speakerChanges: [],
348
+ amplitudeRange: AudioAnalysisData.AmplitudeRange(
349
+ min: minAmplitude,
350
+ max: maxAmplitude
351
+ ),
352
+ rmsRange: AudioAnalysisData.AmplitudeRange(
353
+ min: 0,
354
+ max: 1
355
+ ),
356
+ speechAnalysis: nil,
275
357
  extractionTimeMs: processingTimeMs
276
358
  )
277
359
  }
278
360
 
279
- private func updateSegmentData(channelData: [Float], index: Int, sumSquares: inout Float, zeroCrossings: inout Int, prevValue: inout Float, localMinAmplitude: inout Float, localMaxAmplitude: inout Float, segmentData: inout [Float]) {
280
- let value = channelData[index]
281
- sumSquares += value * value
282
- if index > 0 && value * prevValue < 0 {
283
- zeroCrossings += 1
284
- }
285
- prevValue = value
361
+ private func processSegment(
362
+ _ segment: [Float],
363
+ sampleRate: Float,
364
+ featureOptions: [String: Bool],
365
+ startTime: Float,
366
+ endTime: Float,
367
+ startPosition: Int,
368
+ endPosition: Int
369
+ ) -> DataPoint {
370
+ let sumSquares: Float = segment.reduce(0) { $0 + $1 * $1 }
371
+ let rms = sqrt(sumSquares / Float(segment.count))
372
+ let silent = rms < 0.01
373
+ let dB = Float(20 * log10(Double(rms)))
374
+
375
+ let features = computeFeatures(
376
+ segmentData: segment,
377
+ sampleRate: sampleRate,
378
+ sumSquares: sumSquares,
379
+ zeroCrossings: 0,
380
+ segmentLength: segment.count,
381
+ featureOptions: featureOptions
382
+ )
286
383
 
287
- let absValue = abs(value)
288
- localMinAmplitude = min(localMinAmplitude, absValue)
289
- localMaxAmplitude = max(localMaxAmplitude, absValue)
290
384
 
291
- segmentData.append(value)
385
+ let dataPoint = DataPoint(
386
+ id: Int(uniqueIdCounter),
387
+ amplitude: segment.max() ?? 0,
388
+ rms: rms,
389
+ dB: dB,
390
+ silent: silent,
391
+ features: features,
392
+ speech: SpeechFeatures(isActive: !silent),
393
+ startTime: startTime,
394
+ endTime: endTime,
395
+ startPosition: startPosition,
396
+ endPosition: endPosition,
397
+ samples: segment.count
398
+ )
399
+ uniqueIdCounter += 1
400
+ return dataPoint
292
401
  }
293
402
 
294
- private func computeFeatures(segmentData: [Float], sampleRate: Float, sumSquares: Float, zeroCrossings: Int, segmentLength: Int, featureOptions: [String: Bool]) -> Features {
403
+ private func computeFeatures(
404
+ segmentData: [Float],
405
+ sampleRate: Float,
406
+ sumSquares: Float,
407
+ zeroCrossings: Int,
408
+ segmentLength: Int,
409
+ featureOptions: [String: Bool]
410
+ ) -> Features {
295
411
  let rms = sqrt(sumSquares / Float(segmentLength))
296
412
  let energy = featureOptions["energy"] == true ? sumSquares : 0
297
413
  let zcr = featureOptions["zcr"] == true ? Float(zeroCrossings) / Float(segmentLength) : 0
@@ -303,13 +419,24 @@ public class AudioProcessor {
303
419
  let chromagram = featureOptions["chromagram"] == true ? extractChromagram(from: segmentData, sampleRate: sampleRate) : []
304
420
  let tempo = featureOptions["tempo"] == true ? extractTempo(from: segmentData, sampleRate: sampleRate) : 0
305
421
  let hnr = featureOptions["hnr"] == true ? extractHNR(from: segmentData) : 0
422
+ let melSpectrogram = featureOptions["melSpectrogram"] == true ? computeMelSpectrogram(from: segmentData, sampleRate: sampleRate) : []
423
+ let spectralContrast = featureOptions["spectralContrast"] == true ? computeSpectralContrast(from: segmentData, sampleRate: sampleRate) : []
424
+ let tonnetz = featureOptions["tonnetz"] == true ? computeTonnetz(from: segmentData, sampleRate: sampleRate) : []
425
+ let pitch = featureOptions["pitch"] == true ? estimatePitch(from: segmentData, sampleRate: sampleRate) : 0
426
+
427
+ // Calculate min and max amplitudes from the segment data
428
+ let minAmplitude = segmentData.map(abs).min() ?? 0
429
+ let maxAmplitude = segmentData.map(abs).max() ?? 0
430
+
431
+ let crc32Value = featureOptions["crc32"] == true ?
432
+ calculateCRC32(from: segmentData, count: segmentData.count) : nil
306
433
 
307
434
  return Features(
308
435
  energy: energy,
309
436
  mfcc: mfcc,
310
437
  rms: rms,
311
- minAmplitude: 0, // computed before and will be overwritten
312
- maxAmplitude: 0, // computed before and will be overwritten
438
+ minAmplitude: minAmplitude,
439
+ maxAmplitude: maxAmplitude,
313
440
  zcr: zcr,
314
441
  spectralCentroid: spectralCentroid,
315
442
  spectralFlatness: spectralFlatness,
@@ -317,24 +444,20 @@ public class AudioProcessor {
317
444
  spectralBandwidth: spectralBandwidth,
318
445
  chromagram: chromagram,
319
446
  tempo: tempo,
320
- hnr: hnr
447
+ hnr: hnr,
448
+ melSpectrogram: melSpectrogram,
449
+ spectralContrast: spectralContrast,
450
+ tonnetz: tonnetz,
451
+ pitch: pitch,
452
+ crc32: crc32Value
321
453
  )
322
454
  }
323
455
 
324
- private func resetSegmentData(_ sumSquares: inout Float, _ zeroCrossings: inout Int, _ localMinAmplitude: inout Float, _ localMaxAmplitude: inout Float, _ segmentData: inout [Float]) {
325
- sumSquares = 0
326
- zeroCrossings = 0
327
- localMinAmplitude = .greatestFiniteMagnitude
328
- localMaxAmplitude = -.greatestFiniteMagnitude
329
- segmentData.removeAll()
330
- }
331
-
332
456
  /// Processes audio data with time range support
333
457
  public func processAudioData(
334
458
  startTimeMs: Double? = nil,
335
459
  endTimeMs: Double? = nil,
336
- pointsPerSecond: Int? = nil,
337
- algorithm: String,
460
+ segmentDurationMs: Int = 100, // Default 100ms
338
461
  featureOptions: [String: Bool]
339
462
  ) -> AudioAnalysisData? {
340
463
  guard let audioFile = audioFile else {
@@ -358,9 +481,8 @@ public class AudioProcessor {
358
481
  return nil
359
482
  }
360
483
 
361
- // Calculate frames per buffer based on points per second
362
- let actualPointsPerSecond = pointsPerSecond ?? 20
363
- let framesPerBuffer = AVAudioFrameCount((endFrame - startFrame) / Int64(actualPointsPerSecond))
484
+ // Calculate frames per buffer based on segment duration
485
+ let framesPerBuffer = AVAudioFrameCount(Float(sampleRate) * Float(segmentDurationMs) / 1000.0)
364
486
 
365
487
  guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: framesPerBuffer) else {
366
488
  Logger.debug("Failed to create buffer")
@@ -399,20 +521,18 @@ public class AudioProcessor {
399
521
  summedData[i] /= Float(numberOfChannels)
400
522
  }
401
523
 
402
- // Calculate amplitude based on algorithm
403
- let amplitude: Float
404
- if algorithm.lowercased() == "peak" {
405
- var localMax: Float = 0
406
- vDSP_maxmgv(summedData, 1, &localMax, vDSP_Length(framesToRead))
407
- amplitude = localMax
408
- } else {
409
- var rms: Float = 0
410
- vDSP_rmsqv(summedData, 1, &rms, vDSP_Length(framesToRead))
411
- amplitude = rms
412
- }
524
+ // Calculate both peak amplitude and RMS
525
+ var localMax: Float = 0
526
+ var rms: Float = 0
527
+ vDSP_maxmgv(summedData, 1, &localMax, vDSP_Length(framesToRead))
528
+
529
+ // Calculate RMS using vDSP
530
+ var meanSquare: Float = 0
531
+ vDSP_measqv(summedData, 1, &meanSquare, vDSP_Length(framesToRead))
532
+ rms = sqrt(meanSquare)
413
533
 
414
- minAmplitude = min(minAmplitude, amplitude)
415
- maxAmplitude = max(maxAmplitude, amplitude)
534
+ minAmplitude = min(minAmplitude, localMax)
535
+ maxAmplitude = max(maxAmplitude, localMax)
416
536
 
417
537
  // Create data point
418
538
  let startTime = Float(currentFrame) / Float(sampleRate)
@@ -420,11 +540,24 @@ public class AudioProcessor {
420
540
 
421
541
  let dataPoint = DataPoint(
422
542
  id: currentId,
423
- amplitude: amplitude,
543
+ amplitude: localMax, // Always use peak amplitude
544
+ rms: rms, // Use calculated RMS value
545
+ dB: Float(20 * log10(Double(rms))), // Use RMS for dB calculation
546
+ silent: rms < 0.01, // Use RMS for silence detection
547
+ features: computeFeatures(
548
+ segmentData: Array(UnsafeBufferPointer(start: summedData, count: Int(framesToRead))),
549
+ sampleRate: sampleRate,
550
+ sumSquares: rms * rms,
551
+ zeroCrossings: 0,
552
+ segmentLength: Int(framesToRead),
553
+ featureOptions: featureOptions
554
+ ),
555
+ speech: SpeechFeatures(isActive: rms >= 0.01),
424
556
  startTime: startTime,
425
557
  endTime: endTime,
426
558
  startPosition: Int(currentFrame),
427
- endPosition: Int(currentFrame + Int64(framesToRead))
559
+ endPosition: Int(currentFrame + Int64(framesToRead)),
560
+ samples: Int(framesToRead)
428
561
  )
429
562
 
430
563
  dataPoints.append(dataPoint)
@@ -441,51 +574,43 @@ public class AudioProcessor {
441
574
  let extractionTime = Float(endTime - startTime) * 1000 // Convert to milliseconds
442
575
 
443
576
  return AudioAnalysisData(
444
- pointsPerSecond: actualPointsPerSecond,
445
- durationMs: Float(endFrame - startFrame) * 1000 / Float(sampleRate),
577
+ segmentDurationMs: segmentDurationMs,
578
+ durationMs: Int(Float(endFrame - startFrame) * 1000 / sampleRate),
446
579
  bitDepth: bitDepth,
447
580
  numberOfChannels: numberOfChannels,
448
- sampleRate: sampleRate,
581
+ sampleRate: Int(sampleRate),
449
582
  samples: Int(endFrame - startFrame),
450
583
  dataPoints: dataPoints,
451
- amplitudeRange: (min: minAmplitude, max: maxAmplitude),
584
+ amplitudeRange: AudioAnalysisData.AmplitudeRange(
585
+ min: minAmplitude,
586
+ max: maxAmplitude
587
+ ),
588
+ rmsRange: AudioAnalysisData.AmplitudeRange(
589
+ min: 0,
590
+ max: 1
591
+ ),
592
+ speechAnalysis: nil,
452
593
  extractionTimeMs: extractionTime
453
594
  )
454
595
  }
455
596
 
456
- private func calculateZeroCrossingRate(_ data: [Float]) -> Float {
457
- var count: Float = 0
458
- for i in 1..<data.count {
459
- if (data[i] >= 0 && data[i-1] < 0) || (data[i] < 0 && data[i-1] >= 0) {
460
- count += 1
461
- }
462
- }
463
- return count / Float(data.count)
464
- }
465
-
466
- private func calculateEnergy(_ data: [Float]) -> Float {
467
- var energy: Float = 0
468
- vDSP_svesq(data, 1, &energy, vDSP_Length(data.count))
469
- return energy / Float(data.count)
470
- }
471
-
472
597
  /// Trims audio file to specified range
473
598
  public func trimAudio(
474
599
  startTimeMs: Double,
475
600
  endTimeMs: Double,
476
601
  outputFormat: [String: Any]?
477
602
  ) -> TrimResult? {
478
- guard let audioFile = audioFile else {
603
+ guard let currentAudioFile = audioFile else {
479
604
  Logger.debug("No audio file loaded")
480
605
  return nil
481
606
  }
482
607
 
483
- let sampleRate = audioFile.fileFormat.sampleRate
608
+ let sampleRate = currentAudioFile.fileFormat.sampleRate
484
609
  let startFrame = AVAudioFramePosition(startTimeMs * sampleRate / 1000.0)
485
610
  let endFrame = AVAudioFramePosition(endTimeMs * sampleRate / 1000.0)
486
611
 
487
612
  // Create output format
488
- let outputSettings = createOutputSettings(from: outputFormat, originalFormat: audioFile.fileFormat)
613
+ let outputSettings = createOutputSettings(from: outputFormat, originalFormat: currentAudioFile.fileFormat)
489
614
 
490
615
  // Create temporary output file
491
616
  let outputURL = FileManager.default.temporaryDirectory
@@ -503,11 +628,11 @@ public class AudioProcessor {
503
628
  // Read and write in chunks
504
629
  let bufferSize = 32768
505
630
  let buffer = AVAudioPCMBuffer(
506
- pcmFormat: audioFile.processingFormat,
631
+ pcmFormat: currentAudioFile.processingFormat,
507
632
  frameCapacity: AVAudioFrameCount(bufferSize)
508
633
  )!
509
634
 
510
- audioFile.framePosition = startFrame
635
+ currentAudioFile.framePosition = startFrame
511
636
  var currentFrame = startFrame
512
637
 
513
638
  while currentFrame < endFrame {
@@ -516,7 +641,7 @@ public class AudioProcessor {
516
641
  AVAudioFrameCount(endFrame - currentFrame)
517
642
  )
518
643
 
519
- try audioFile.read(into: buffer, frameCount: framesToRead)
644
+ try currentAudioFile.read(into: buffer, frameCount: framesToRead)
520
645
  try outputFile.write(from: buffer)
521
646
 
522
647
  currentFrame += Int64(framesToRead)
@@ -526,12 +651,18 @@ public class AudioProcessor {
526
651
  let attributes = try FileManager.default.attributesOfItem(atPath: outputURL.path)
527
652
  let fileSize = attributes[.size] as! Int64
528
653
 
529
- return TrimResult(
654
+ // After successful trim, update the class property
655
+ audioFile = try AVAudioFile(forReading: outputURL)
656
+
657
+ // After successful trim, create the result
658
+ let trimmedDuration = (endTimeMs - startTimeMs) / 1000.0 // Convert to seconds
659
+ let result = TrimResult(
530
660
  uri: outputURL.absoluteString,
531
- duration: Double(endFrame - startFrame) / sampleRate,
661
+ duration: trimmedDuration, // Use actual trimmed duration
532
662
  size: fileSize
533
663
  )
534
664
 
665
+ return result
535
666
  } catch {
536
667
  Logger.debug("Error trimming audio: \(error)")
537
668
  return nil
@@ -561,10 +692,14 @@ public class AudioProcessor {
561
692
  /// - numberOfPoints: The number of points to extract
562
693
  /// - startTimeMs: Optional start time in milliseconds
563
694
  /// - endTimeMs: Optional end time in milliseconds
564
- /// - algorithm: The algorithm to use for feature extraction
565
695
  /// - featureOptions: The features to extract
566
696
  /// - Returns: An `AudioAnalysisData` object containing the extracted features
567
- public func extractPreview(numberOfPoints: Int, startTimeMs: Double? = nil, endTimeMs: Double? = nil, algorithm: String, featureOptions: [String: Bool]) -> AudioAnalysisData? {
697
+ public func extractPreview(
698
+ numberOfPoints: Int,
699
+ startTimeMs: Double? = nil,
700
+ endTimeMs: Double? = nil,
701
+ featureOptions: [String: Bool]
702
+ ) -> AudioAnalysisData? {
568
703
  guard let audioFile = audioFile else {
569
704
  reject("FILE_NOT_INITIALIZED", "Audio file is not initialized.")
570
705
  return nil
@@ -576,9 +711,9 @@ public class AudioProcessor {
576
711
  // Calculate effective time range
577
712
  let effectiveStartMs = startTimeMs ?? 0.0
578
713
  let effectiveEndMs = min(endTimeMs ?? totalDurationMs, totalDurationMs)
579
- let durationMs = effectiveEndMs - effectiveStartMs
714
+ let durationMs = effectiveEndMs - effectiveStartMs // This is the actual duration we want to use
580
715
 
581
- // Convert time to frames
716
+ // Convert time to frames with proper offset
582
717
  let startFrame = AVAudioFramePosition(effectiveStartMs * Double(sampleRate) / 1000.0)
583
718
  let endFrame = AVAudioFramePosition(effectiveEndMs * Double(sampleRate) / 1000.0)
584
719
  let samplesInRange = Int(endFrame - startFrame)
@@ -596,11 +731,19 @@ public class AudioProcessor {
596
731
  var minAmplitude: Float = .greatestFiniteMagnitude
597
732
  var maxAmplitude: Float = -.greatestFiniteMagnitude
598
733
 
734
+ let bytesPerSample = audioFile.fileFormat.settings[AVLinearPCMBitDepthKey] as? Int ?? 16 / 8
735
+
599
736
  for i in 0..<numberOfPoints {
600
737
  let pointStartFrame = startFrame + Int64(i * samplesPerPoint)
601
738
  let pointEndFrame = startFrame + Int64((i + 1) * samplesPerPoint)
602
739
  let framesToRead = AVAudioFrameCount(pointEndFrame - pointStartFrame)
603
740
 
741
+ // Calculate byte positions
742
+ let startPosition = Int(pointStartFrame) * bytesPerSample * Int(audioFile.fileFormat.channelCount)
743
+ let endPosition = Int(pointEndFrame) * bytesPerSample * Int(audioFile.fileFormat.channelCount)
744
+ let segmentStartTime = Float(pointStartFrame) / sampleRate
745
+ let segmentEndTime = Float(pointEndFrame) / sampleRate
746
+
604
747
  do {
605
748
  audioFile.framePosition = pointStartFrame
606
749
  let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: framesToRead)!
@@ -637,24 +780,23 @@ public class AudioProcessor {
637
780
 
638
781
  let rms = features.rms
639
782
  let silent = rms < 0.01
640
- let dB = featureOptions["dB"] == true ? 20 * log10(rms) : 0
641
-
642
- let segmentStartTime = Float(pointStartFrame) / sampleRate
643
- let segmentEndTime = Float(pointEndFrame) / sampleRate
783
+ let dB = Float(20 * log10(Double(rms)))
644
784
 
645
- dataPoints.append(DataPoint(
646
- id: uniqueIdCounter,
647
- amplitude: algorithm == "peak" ? localMaxAmplitude : rms,
648
- activeSpeech: nil,
785
+ let dataPoint = DataPoint(
786
+ id: Int(uniqueIdCounter),
787
+ amplitude: localMaxAmplitude,
788
+ rms: rms,
649
789
  dB: dB,
650
790
  silent: silent,
651
791
  features: features,
792
+ speech: SpeechFeatures(isActive: !silent),
652
793
  startTime: segmentStartTime,
653
794
  endTime: segmentEndTime,
654
- startPosition: Int(pointStartFrame),
655
- endPosition: Int(pointEndFrame),
656
- speaker: 0
657
- ))
795
+ startPosition: startPosition,
796
+ endPosition: endPosition,
797
+ samples: Int(framesToRead)
798
+ )
799
+ dataPoints.append(dataPoint)
658
800
  uniqueIdCounter += 1
659
801
 
660
802
  minAmplitude = min(minAmplitude, localMinAmplitude)
@@ -694,15 +836,22 @@ public class AudioProcessor {
694
836
  """)
695
837
 
696
838
  return AudioAnalysisData(
697
- pointsPerSecond: numberOfPoints,
698
- durationMs: Float(durationMs),
839
+ segmentDurationMs: 100, // Default 100ms
840
+ durationMs: Int(durationMs), // Use actual duration of trimmed section
699
841
  bitDepth: bitDepth,
700
842
  numberOfChannels: numberOfChannels,
701
- sampleRate: sampleRate,
843
+ sampleRate: Int(sampleRate),
702
844
  samples: samplesInRange,
703
845
  dataPoints: dataPoints,
704
- amplitudeRange: (min: minAmplitude, max: maxAmplitude),
705
- speakerChanges: [],
846
+ amplitudeRange: AudioAnalysisData.AmplitudeRange(
847
+ min: minAmplitude,
848
+ max: maxAmplitude
849
+ ),
850
+ rmsRange: AudioAnalysisData.AmplitudeRange(
851
+ min: 0,
852
+ max: 1
853
+ ),
854
+ speechAnalysis: nil,
706
855
  extractionTimeMs: extractionTimeMs
707
856
  )
708
857
  }