react-native-sherpa-onnx 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -15
- package/SherpaOnnx.podspec +13 -5
- package/android/prebuilt-download.gradle +18 -5
- package/android/prebuilt-versions.gradle +8 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +43 -142
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +12 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +694 -307
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +194 -99
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +90 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +70 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +39 -19
- package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
- package/ios/SherpaOnnx+STT.mm +2 -0
- package/ios/SherpaOnnx.mm +1 -1
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +9 -3
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +38 -54
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +620 -267
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +131 -28
- package/ios/model_detect/sherpa-onnx-model-detect.h +70 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +52 -0
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/stt/streaming.js +6 -3
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +16 -2
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +17 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/check-model-csvs.sh +72 -0
- package/scripts/setup-ios-framework.sh +48 -48
- package/src/NativeSherpaOnnx.ts +18 -2
- package/src/audio/index.ts +81 -0
- package/src/stt/streaming.ts +10 -5
- package/src/stt/streamingTypes.ts +1 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
package com.sherpaonnx
|
|
2
|
+
|
|
3
|
+
import android.media.AudioFormat
|
|
4
|
+
import android.media.AudioRecord
|
|
5
|
+
import android.media.MediaRecorder
|
|
6
|
+
import android.util.Base64
|
|
7
|
+
import android.util.Log
|
|
8
|
+
import java.nio.ByteBuffer
|
|
9
|
+
import java.nio.ByteOrder
|
|
10
|
+
import kotlin.concurrent.thread
|
|
11
|
+
import kotlin.math.round
|
|
12
|
+
|
|
13
|
+
/**
 * Native PCM capture from the microphone with optional resampling to a target sample rate.
 *
 * Audio is always recorded as 16-bit mono PCM. The recorder is opened at [targetSampleRate]
 * when the device reports it as supported; otherwise the first supported entry of
 * [CAPTURE_RATES] is used and every chunk is resampled (linear interpolation) to
 * [targetSampleRate] before being emitted. Chunks are delivered to [onChunk] as base64-encoded
 * little-endian Int16 samples, on the capture thread.
 *
 * @param targetSampleRate sample rate (Hz) of the PCM handed to [onChunk]; must be positive.
 * @param channelCount currently not consulted by this class; capture is always mono.
 * @param bufferSizeFrames requested buffer size in frames; values <= 0 select a 0.1 s default.
 * @param onChunk receives each base64 chunk together with [targetSampleRate].
 * @param onError receives a human-readable message when capture cannot start or fails.
 * @param logTag tag used for logcat output.
 */
class SherpaOnnxPcmCapture(
    private val targetSampleRate: Int,
    private val channelCount: Int,
    private val bufferSizeFrames: Int,
    private val onChunk: (base64Pcm: String, sampleRate: Int) -> Unit,
    private val onError: (message: String) -> Unit,
    private val logTag: String = "SherpaOnnxPcmCapture"
) {
    // Written by start()/stop() on the caller's thread and by the capture thread's cleanup.
    @Volatile
    private var audioRecord: AudioRecord? = null

    @Volatile
    private var running = false
    private var captureThread: Thread? = null

    /**
     * Start capture. Uses a supported hardware rate and resamples to [targetSampleRate] before emitting.
     *
     * Does nothing (besides logging) when capture is already running. Failures to open the
     * recorder are reported through [onError].
     */
    fun start() {
        if (running) {
            Log.w(logTag, "start: already running")
            return
        }
        if (targetSampleRate <= 0) {
            Log.e(logTag, "start: invalid target sample rate $targetSampleRate")
            onError("Invalid target sample rate: $targetSampleRate")
            return
        }
        val requestedBytes = if (bufferSizeFrames > 0) {
            bufferSizeFrames * BYTES_PER_SAMPLE // 16-bit mono
        } else {
            (0.1 * targetSampleRate).toInt() * BYTES_PER_SAMPLE // 0.1 s default (16-bit mono)
        }
        val captureRate = chooseCaptureRate()
        val minBuf = AudioRecord.getMinBufferSize(
            captureRate,
            AudioFormat.CHANNEL_IN_MONO,
            AudioFormat.ENCODING_PCM_16BIT
        )
        val bufSize = minBuf.coerceAtLeast(requestedBytes)
        val record = try {
            AudioRecord(
                MediaRecorder.AudioSource.VOICE_RECOGNITION,
                captureRate,
                AudioFormat.CHANNEL_IN_MONO,
                AudioFormat.ENCODING_PCM_16BIT,
                bufSize
            )
        } catch (e: SecurityException) {
            Log.e(logTag, "start: RECORD_AUDIO permission not granted", e)
            onError("RECORD_AUDIO permission not granted")
            return
        } catch (e: IllegalArgumentException) {
            // Thrown by the AudioRecord constructor for unsupported parameters
            // (e.g. a non-positive buffer size when no capture rate is supported).
            Log.e(logTag, "start: invalid AudioRecord parameters", e)
            onError("AudioRecord failed to initialize")
            return
        }
        if (record.state != AudioRecord.STATE_INITIALIZED) {
            Log.e(logTag, "start: AudioRecord not initialized")
            onError("AudioRecord failed to initialize")
            record.release()
            return
        }
        audioRecord = record
        running = true
        captureThread = thread(name = "SherpaOnnxPcmCapture") {
            captureLoop(record, captureRate, ShortArray(bufSize / BYTES_PER_SAMPLE))
        }
    }

    /** Stop capture and release resources. */
    fun stop() {
        running = false
        // Actively stop AudioRecord to unblock any pending read()
        audioRecord?.let { record ->
            try {
                record.stop()
            } catch (_: Exception) {
                // Ignore; the capture thread's finally block also handles stop/release safely
            }
        }
        val worker = captureThread
        // When stop() is invoked from a capture callback, joining would only wait on ourselves
        // until the timeout expires; the loop exits on its own once `running` is false.
        if (worker != null && worker !== Thread.currentThread()) {
            worker.join(JOIN_TIMEOUT_MS)
        }
        captureThread = null
        audioRecord = null
    }

    /** Returns [targetSampleRate] if supported, else the first supported [CAPTURE_RATES] entry, else 44100. */
    private fun chooseCaptureRate(): Int =
        (sequenceOf(targetSampleRate) + CAPTURE_RATES.asSequence())
            .firstOrNull { rate -> isRateSupported(rate) }
            ?: 44100

    /** True when [AudioRecord.getMinBufferSize] does not report an error for 16-bit mono at [rate]. */
    private fun isRateSupported(rate: Int): Boolean {
        val size = AudioRecord.getMinBufferSize(
            rate,
            AudioFormat.CHANNEL_IN_MONO,
            AudioFormat.ENCODING_PCM_16BIT
        )
        return size != AudioRecord.ERROR && size != AudioRecord.ERROR_BAD_VALUE
    }

    /** Body of the capture thread: reads, resamples, encodes and emits until stopped or failed. */
    private fun captureLoop(record: AudioRecord, captureRate: Int, shortBuf: ShortArray) {
        try {
            record.startRecording()
            while (running && record.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
                val read = record.read(shortBuf, 0, shortBuf.size)
                if (read < 0) {
                    // Negative values are AudioRecord error codes; retrying would spin on the
                    // same error, so report it (unless we are shutting down) and leave the loop.
                    if (running) {
                        Log.e(logTag, "Capture thread error: AudioRecord.read returned $read")
                        onError("AudioRecord read failed: $read")
                    }
                    break
                }
                if (read == 0) continue
                val chunk = shortBuf.copyOf(read)
                val toEmit = resampleInt16(chunk, captureRate, targetSampleRate)
                onChunk(encodeBase64(toEmit), targetSampleRate)
            }
        } catch (e: Exception) {
            if (running) {
                Log.e(logTag, "Capture thread error", e)
                onError(e.message ?: "Capture error")
            }
        } finally {
            try {
                record.stop()
            } catch (_: Exception) { }
            record.release()
            // Only reset shared state if it still belongs to this session, so a session started
            // after a timed-out stop() is not clobbered. Clearing `running` lets start() work
            // again after the thread ended on its own.
            if (audioRecord === record) {
                audioRecord = null
                running = false
            }
        }
    }

    /** Encodes [samples] as little-endian Int16 bytes and returns them as unwrapped base64. */
    private fun encodeBase64(samples: ShortArray): String {
        val bytes = ByteBuffer.allocate(samples.size * BYTES_PER_SAMPLE).order(ByteOrder.LITTLE_ENDIAN)
        for (s in samples) bytes.putShort(s)
        return Base64.encodeToString(bytes.array(), Base64.NO_WRAP)
    }

    companion object {
        /** Supported capture sample rates to try in order (device-dependent). */
        private val CAPTURE_RATES = intArrayOf(16000, 44100, 48000)

        /** Bytes per 16-bit mono sample. */
        private const val BYTES_PER_SAMPLE = 2

        /** Maximum time stop() waits for the capture thread to finish. */
        private const val JOIN_TIMEOUT_MS = 2000L

        /**
         * Resample Int16 PCM from capture rate to target rate using linear interpolation.
         *
         * Returns [input] itself when the rates are equal; otherwise a new ShortArray of
         * Int16 samples at the target rate whose length is `input.size * toRate / fromRate`
         * rounded to the nearest integer.
         */
        private fun resampleInt16(
            input: ShortArray,
            fromRate: Int,
            toRate: Int
        ): ShortArray {
            if (fromRate == toRate) return input
            val ratio = fromRate.toDouble() / toRate
            val outLength = round(input.size / ratio).toInt().coerceAtLeast(0)
            val result = ShortArray(outLength)
            for (i in 0 until outLength) {
                val srcIdx = i * ratio
                // Clamp both neighbours to the last sample so the tail of the chunk is held.
                val idx0 = srcIdx.toInt().coerceIn(0, input.size - 1)
                val idx1 = (idx0 + 1).coerceAtMost(input.size - 1)
                val frac = (srcIdx - idx0).toFloat()
                val v0 = input[idx0].toInt()
                val v1 = input[idx1].toInt()
                result[i] = (v0 + (v1 - v0) * frac).toInt().toShort()
            }
            return result
        }
    }
}
|
|
@@ -2,6 +2,7 @@ package com.sherpaonnx
|
|
|
2
2
|
|
|
3
3
|
import android.content.Context
|
|
4
4
|
import android.net.Uri
|
|
5
|
+
import android.os.HandlerThread
|
|
5
6
|
import android.util.Log
|
|
6
7
|
import com.facebook.react.bridge.Arguments
|
|
7
8
|
import com.facebook.react.bridge.Promise
|
|
@@ -51,6 +52,9 @@ internal class SherpaOnnxSttHelper(
|
|
|
51
52
|
|
|
52
53
|
private val instances = ConcurrentHashMap<String, SttEngineInstance>()
|
|
53
54
|
|
|
55
|
+
private val initThread = HandlerThread("stt-init").also { it.start() }
|
|
56
|
+
private val initHandler = android.os.Handler(initThread.looper)
|
|
57
|
+
|
|
54
58
|
private fun getInstance(instanceId: String): SttEngineInstance? = instances[instanceId]
|
|
55
59
|
|
|
56
60
|
/** Hotwords are supported for transducer and NeMo transducer models (sherpa-onnx; NeMo: https://github.com/k2-fsa/sherpa-onnx/pull/3077). */
|
|
@@ -277,26 +281,33 @@ internal class SherpaOnnxSttHelper(
|
|
|
277
281
|
)
|
|
278
282
|
inst.lastRecognizerConfig = config
|
|
279
283
|
inst.currentSttModelType = modelTypeStr
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
284
|
+
// Defer recognizer creation to the dedicated background thread so release() of the previous
|
|
285
|
+
// recognizer can complete off the UI thread (avoids "destroyed mutex" / SIGSEGV when switching models).
|
|
286
|
+
initHandler.post {
|
|
287
|
+
try {
|
|
288
|
+
inst.recognizer = OfflineRecognizer(config = config)
|
|
289
|
+
val resultMap = Arguments.createMap()
|
|
290
|
+
resultMap.putBoolean("success", true)
|
|
291
|
+
resultMap.putString("modelType", modelTypeStr)
|
|
292
|
+
resultMap.putString("decodingMethod", config.decodingMethod)
|
|
293
|
+
val detectedModelsArray = Arguments.createArray()
|
|
294
|
+
for (model in detectedModels) {
|
|
295
|
+
val modelMap = model as? HashMap<*, *>
|
|
296
|
+
if (modelMap != null) {
|
|
297
|
+
val modelResultMap = Arguments.createMap()
|
|
298
|
+
modelResultMap.putString("type", modelMap["type"] as? String ?: "")
|
|
299
|
+
modelResultMap.putString("modelDir", modelMap["modelDir"] as? String ?: "")
|
|
300
|
+
detectedModelsArray.pushMap(modelResultMap)
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
resultMap.putArray("detectedModels", detectedModelsArray)
|
|
304
|
+
promise.resolve(resultMap)
|
|
305
|
+
} catch (e: Exception) {
|
|
306
|
+
val errorMsg = "Exception creating recognizer: ${e.message ?: e.javaClass.simpleName}"
|
|
307
|
+
Log.e(logTag, errorMsg, e)
|
|
308
|
+
promise.reject("INIT_ERROR", errorMsg, e)
|
|
296
309
|
}
|
|
297
310
|
}
|
|
298
|
-
resultMap.putArray("detectedModels", detectedModelsArray)
|
|
299
|
-
promise.resolve(resultMap)
|
|
300
311
|
} catch (e: Exception) {
|
|
301
312
|
val errorMsg = "Exception during initialization: ${e.message ?: e.javaClass.simpleName}"
|
|
302
313
|
Log.e(logTag, errorMsg, e)
|
|
@@ -588,7 +599,16 @@ internal class SherpaOnnxSttHelper(
|
|
|
588
599
|
preprocessor = path(paths, "moonshinePreprocessor"),
|
|
589
600
|
encoder = path(paths, "moonshineEncoder"),
|
|
590
601
|
uncachedDecoder = path(paths, "moonshineUncachedDecoder"),
|
|
591
|
-
cachedDecoder = path(paths, "moonshineCachedDecoder")
|
|
602
|
+
cachedDecoder = path(paths, "moonshineCachedDecoder"),
|
|
603
|
+
mergedDecoder = ""
|
|
604
|
+
),
|
|
605
|
+
tokens = path(paths, "tokens"),
|
|
606
|
+
modelType = "moonshine"
|
|
607
|
+
)
|
|
608
|
+
"moonshine_v2" -> OfflineModelConfig(
|
|
609
|
+
moonshine = OfflineMoonshineModelConfig(
|
|
610
|
+
encoder = path(paths, "moonshineEncoder"),
|
|
611
|
+
mergedDecoder = path(paths, "moonshineMergedDecoder")
|
|
592
612
|
),
|
|
593
613
|
tokens = path(paths, "tokens"),
|
|
594
614
|
modelType = "moonshine"
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SherpaOnnx+PcmLiveStream.mm
|
|
3
|
+
*
|
|
4
|
+
* Native PCM live capture from the microphone via Audio Queue API (AudioQueueNewInput).
|
|
5
|
+
* Captures at a supported hardware rate (16000, 44100, 48000), resamples to the requested
|
|
6
|
+
* target rate, and emits pcmLiveStreamData at target rate (same behavior as Android).
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#import "SherpaOnnx.h"
|
|
10
|
+
#import <AVFoundation/AVFoundation.h>
|
|
11
|
+
#import <AudioToolbox/AudioToolbox.h>
|
|
12
|
+
#import <React/RCTLog.h>
|
|
13
|
+
#import <stdlib.h>
|
|
14
|
+
|
|
15
|
+
static const UInt32 kPcmLiveAQNumberBuffers = 3;
|
|
16
|
+
/** Capture sample rates to try in order (match Android CAPTURE_RATES). */
|
|
17
|
+
static const int kPcmLiveCaptureRates[] = { 16000, 44100, 48000 };
|
|
18
|
+
static const size_t kPcmLiveCaptureRatesCount = sizeof(kPcmLiveCaptureRates) / sizeof(kPcmLiveCaptureRates[0]);
|
|
19
|
+
|
|
20
|
+
static NSInteger _pcmLiveTargetSampleRate = 16000;
|
|
21
|
+
static NSInteger _pcmLiveCaptureRate = 16000;
|
|
22
|
+
static __weak SherpaOnnx *_pcmLiveModule = nil;
|
|
23
|
+
static AudioQueueRef _pcmLiveAudioQueue = NULL;
|
|
24
|
+
static AudioQueueBufferRef _pcmLiveAQBuffers[kPcmLiveAQNumberBuffers];
|
|
25
|
+
static volatile BOOL _pcmLiveAQRunning = NO;
|
|
26
|
+
|
|
27
|
+
/**
 * Sends `count` Int16 samples to JS as a "pcmLiveStreamData" event.
 * The payload carries the samples base64-encoded plus the given sample rate.
 * No-op when `module` is nil or `count` is zero.
 */
static void emitPcmChunk(SherpaOnnx *module, const int16_t *samples, NSUInteger count, NSInteger sampleRate) {
  if (count == 0 || !module) {
    return;
  }
  // Snapshot the samples on the calling thread: the caller may reuse or free
  // its buffer as soon as this function returns.
  const NSUInteger byteLength = count * sizeof(int16_t);
  NSData *pcmCopy = [NSData dataWithBytes:samples length:byteLength];

  // Encode and emit on the main queue.
  dispatch_block_t sendEvent = ^{
    NSString *encoded = [pcmCopy base64EncodedStringWithOptions:0];
    NSDictionary *payload = @{ @"base64Pcm": encoded, @"sampleRate": @(sampleRate) };
    [module sendEventWithName:@"pcmLiveStreamData" body:payload];
  };
  dispatch_async(dispatch_get_main_queue(), sendEvent);
}
|
|
40
|
+
|
|
41
|
+
/**
 * Sends a "pcmLiveStreamError" event with `message` (empty string when nil) to JS.
 * No-op when `module` is nil.
 */
static void emitPcmError(SherpaOnnx *module, NSString *message) {
  if (module == nil) {
    return;
  }
  // Emit on the main queue, like the data events.
  dispatch_block_t sendEvent = ^{
    NSDictionary *payload = @{ @"message": message ?: @"" };
    [module sendEventWithName:@"pcmLiveStreamError" body:payload];
  };
  dispatch_async(dispatch_get_main_queue(), sendEvent);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Resample Int16 PCM from fromRate to toRate using linear interpolation.
 *
 * Writes at most `outputCapacity` samples into `output` and returns the number written.
 * When the rates are equal the input is copied through (truncated to the capacity).
 * Otherwise the output length is inputFrames * toRate / fromRate, truncated toward zero.
 */
static NSUInteger pcmLiveResampleInt16(const int16_t *input, NSUInteger inputFrames,
                                       int fromRate, int toRate,
                                       int16_t *output, size_t outputCapacity) {
  if (fromRate == toRate) {
    // Pass-through: plain copy, bounded by the destination size.
    size_t framesToCopy = outputCapacity;
    if (inputFrames < outputCapacity) {
      framesToCopy = inputFrames;
    }
    memcpy(output, input, framesToCopy * sizeof(int16_t));
    return framesToCopy;
  }

  const double step = (double)fromRate / (double)toRate;
  NSUInteger produced = (NSUInteger)((double)inputFrames / step);
  if (produced > outputCapacity) {
    produced = outputCapacity;
  }
  if (produced == 0) {
    return 0;
  }

  const NSUInteger lastIndex = inputFrames - 1;
  for (NSUInteger outPos = 0; outPos < produced; outPos++) {
    const double srcPos = (double)outPos * step;
    // Left/right neighbours, both clamped to the final input sample.
    NSUInteger left = (NSUInteger)srcPos;
    if (left >= inputFrames) {
      left = lastIndex;
    }
    NSUInteger right = left + 1;
    if (right >= inputFrames) {
      right = lastIndex;
    }
    const float frac = (float)(srcPos - (double)left);
    const int a = (int)input[left];
    const int b = (int)input[right];
    int mixed = (int)(a + (b - a) * frac);
    // Saturate to the Int16 range before narrowing.
    if (mixed < -32768) {
      mixed = -32768;
    }
    if (mixed > 32767) {
      mixed = 32767;
    }
    output[outPos] = (int16_t)mixed;
  }
  return produced;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Audio Queue input callback: forwards each filled buffer to JS and hands the
 * buffer back to the queue.
 *
 * Returns without re-enqueueing when the stream is flagged as not running or the
 * owning module is gone. Otherwise the buffer is always re-enqueued exactly once,
 * after its samples were emitted directly (capture rate == target rate) or
 * resampled into a temporary buffer first.
 */
static void pcmLiveAQInputCallback(void *inUserData,
                                   AudioQueueRef inAQ,
                                   AudioQueueBufferRef inBuffer,
                                   const AudioTimeStamp *inStartTime,
                                   UInt32 inNumPackets,
                                   const AudioStreamPacketDescription *inPacketDesc) {
  (void)inUserData;
  (void)inStartTime;
  (void)inNumPackets;
  (void)inPacketDesc;

  if (!_pcmLiveAQRunning) return;
  SherpaOnnx *owner = _pcmLiveModule;
  if (!owner) return;

  const UInt32 filledBytes = inBuffer->mAudioDataByteSize;
  if (filledBytes != 0) {
    const int16_t *pcm = (const int16_t *)inBuffer->mAudioData;
    const NSUInteger frameCount = filledBytes / sizeof(int16_t);
    const NSInteger dstRate = _pcmLiveTargetSampleRate;
    const NSInteger srcRate = _pcmLiveCaptureRate;

    if (srcRate == dstRate) {
      emitPcmChunk(owner, pcm, frameCount, dstRate);
    } else {
      // Ceiling of frameCount * dstRate / srcRate: an upper bound on the resampled length.
      const NSUInteger capacity =
          (frameCount * (NSUInteger)dstRate + (NSUInteger)srcRate - 1) /
          (NSUInteger)srcRate;
      if (capacity != 0) {
        int16_t *scratch = (int16_t *)malloc(capacity * sizeof(int16_t));
        if (scratch == NULL) {
          emitPcmError(owner, @"Failed to allocate resample buffer");
        } else {
          const NSUInteger produced = pcmLiveResampleInt16(pcm, frameCount,
                                                           (int)srcRate, (int)dstRate,
                                                           scratch, capacity);
          if (produced > 0) {
            // emitPcmChunk copies the samples, so scratch can be freed right after.
            emitPcmChunk(owner, scratch, produced, dstRate);
          }
          free(scratch);
        }
      }
    }
  }

  // Return the buffer to the queue so capture keeps flowing.
  AudioQueueEnqueueBuffer(inAQ, inBuffer, 0, NULL);
}
|
|
129
|
+
|
|
130
|
+
/**
 * Stops and tears down the live-capture audio queue, if one exists:
 * clears the running flag, stops the queue, frees every allocated buffer,
 * disposes the queue and resets the global handle to NULL.
 */
static void pcmLiveStopQueue(void) {
  AudioQueueRef queue = _pcmLiveAudioQueue;
  if (queue == NULL) {
    return;
  }

  // Flag first so the input callback ignores buffers delivered while stopping.
  _pcmLiveAQRunning = NO;
  AudioQueueStop(queue, true);

  for (UInt32 slot = 0; slot < kPcmLiveAQNumberBuffers; slot++) {
    AudioQueueBufferRef buffer = _pcmLiveAQBuffers[slot];
    if (buffer == NULL) {
      continue;
    }
    AudioQueueFreeBuffer(queue, buffer);
    _pcmLiveAQBuffers[slot] = NULL;
  }

  AudioQueueDispose(queue, true);
  _pcmLiveAudioQueue = NULL;
}
|
|
143
|
+
|
|
144
|
+
@implementation SherpaOnnx (PcmLiveStream)

/**
 * Starts live PCM capture (fallback / non-codegen entry point).
 *
 * `optionsArg` may be an NSDictionary with optional "sampleRate" (target rate in Hz,
 * default 16000) and "bufferSizeFrames" (default 0 = built-in buffer size).
 * Resolves with nil once the audio queue is running; rejects with
 * "PCM_LIVE_STREAM_ERROR" otherwise.
 */
- (void)startPcmLiveStream:(id __unsafe_unretained)optionsArg
                   resolve:(RCTPromiseResolveBlock)resolve
                    reject:(RCTPromiseRejectBlock)reject
{
  int targetRate = 16000;
  UInt32 bufferSizeFrames = 0;

  // Parse optionsArg coming from JS (fallback / non-codegen path).
  if ([optionsArg isKindOfClass:[NSDictionary class]]) {
    NSDictionary *dict = (NSDictionary *)optionsArg;

    id sampleRateValue = dict[@"sampleRate"];
    if ([sampleRateValue respondsToSelector:@selector(intValue)]) {
      int v = (int)[sampleRateValue intValue];
      if (v > 0) targetRate = v;
    }

    id bufferSizeValue = dict[@"bufferSizeFrames"];
    if ([bufferSizeValue respondsToSelector:@selector(doubleValue)]) {
      double v = [bufferSizeValue doubleValue];
      // Cap before converting: casting an out-of-range double to UInt32 is undefined.
      // 16384 frames already maps to the 32768-byte ceiling applied when allocating.
      if (v > 0) bufferSizeFrames = (UInt32)(v > 16384.0 ? 16384.0 : v);
    }
  }

  [self _startPcmLiveStreamWithTargetRate:targetRate bufferSizeFrames:bufferSizeFrames resolve:resolve reject:reject];
}

#if __has_include(<SherpaOnnxSpec/SherpaOnnxSpec.h>)
/** Codegen entry point: same behavior as startPcmLiveStream:resolve:reject: with typed options. */
- (void)startPcmLiveStreamWithOptions:(JS::NativeSherpaOnnx::SpecStartPcmLiveStreamOptions &)options
                              resolve:(RCTPromiseResolveBlock)resolve
                               reject:(RCTPromiseRejectBlock)reject
{
  int targetRate = 16000;
  if (options.sampleRate()) {
    targetRate = (int)options.sampleRate();
    if (targetRate <= 0) targetRate = 16000;
  }
  UInt32 bufferSizeFrames = 0;
  if (options.bufferSizeFrames().has_value()) {
    double v = options.bufferSizeFrames().value();
    // Cap before converting: casting an out-of-range double to UInt32 is undefined.
    if (v > 0) bufferSizeFrames = (UInt32)(v > 16384.0 ? 16384.0 : v);
  }
  [self _startPcmLiveStreamWithTargetRate:targetRate bufferSizeFrames:bufferSizeFrames resolve:resolve reject:reject];
}
#endif

/**
 * Shared start implementation.
 *
 * Stops any previous queue, configures and activates the shared audio session,
 * creates an input queue at the first capture rate that AudioQueueNewInput accepts,
 * allocates and enqueues the capture buffers, then starts the queue.
 * Every failure tears the queue down, deactivates the session and rejects with
 * "PCM_LIVE_STREAM_ERROR".
 */
- (void)_startPcmLiveStreamWithTargetRate:(int)targetRate
                         bufferSizeFrames:(UInt32)bufferSizeFrames
                                  resolve:(RCTPromiseResolveBlock)resolve
                                   reject:(RCTPromiseRejectBlock)reject
{
  pcmLiveStopQueue();

  _pcmLiveTargetSampleRate = targetRate;
  _pcmLiveModule = self;

  NSError *error = nil;
  AVAudioSession *session = [AVAudioSession sharedInstance];
  if (![session setCategory:AVAudioSessionCategoryPlayAndRecord
                       mode:AVAudioSessionModeDefault
                    options:AVAudioSessionCategoryOptionDefaultToSpeaker | AVAudioSessionCategoryOptionAllowBluetooth
                      error:&error]) {
    RCTLog(@"%@", [NSString stringWithFormat:@"[SherpaOnnx PcmLive] setCategory error: %@", error]);
    reject(@"PCM_LIVE_STREAM_ERROR", error.localizedDescription ?: @"Failed to set audio session", error);
    return;
  }
  if (![session setActive:YES withOptions:0 error:&error]) {
    RCTLog(@"%@", [NSString stringWithFormat:@"[SherpaOnnx PcmLive] setActive error: %@", error]);
    reject(@"PCM_LIVE_STREAM_ERROR", error.localizedDescription ?: @"Failed to activate audio session", error);
    return;
  }

  // 16-bit signed, packed, mono linear PCM; the sample rate is filled in per attempt below.
  AudioStreamBasicDescription fmt;
  memset(&fmt, 0, sizeof(fmt));
  fmt.mFormatID = kAudioFormatLinearPCM;
  fmt.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;
  fmt.mChannelsPerFrame = 1;
  fmt.mBitsPerChannel = 16;
  fmt.mBytesPerPacket = 2;
  fmt.mBytesPerFrame = 2;
  fmt.mFramesPerPacket = 1;

  OSStatus status = noErr;
  int chosenCaptureRate = 16000;
  for (size_t r = 0; r < kPcmLiveCaptureRatesCount; r++) {
    chosenCaptureRate = kPcmLiveCaptureRates[r];
    fmt.mSampleRate = (Float64)chosenCaptureRate;
    status = AudioQueueNewInput(&fmt, pcmLiveAQInputCallback, NULL, NULL, NULL, 0, &_pcmLiveAudioQueue);
    if (status == noErr) break;
    _pcmLiveAudioQueue = NULL;
  }
  if (status != noErr || _pcmLiveAudioQueue == NULL) {
    [session setActive:NO withOptions:0 error:nil];
    reject(@"PCM_LIVE_STREAM_ERROR", [NSString stringWithFormat:@"AudioQueueNewInput failed for all rates (last: %d)", (int)status], nil);
    return;
  }
  _pcmLiveCaptureRate = chosenCaptureRate;

  UInt32 bufferByteSize = 2048;
  if (bufferSizeFrames > 0) {
    // Cap the frame count first so the multiplication below cannot wrap around UInt32
    // and slip a tiny value past the range checks.
    if (bufferSizeFrames > 16384) bufferSizeFrames = 16384;
    bufferByteSize = bufferSizeFrames * 2; /* 16-bit mono */
    if (bufferByteSize < 1024) bufferByteSize = 1024;
    if (bufferByteSize > 32768) bufferByteSize = 32768;
  }

  for (UInt32 i = 0; i < kPcmLiveAQNumberBuffers; i++) {
    NSString *failedStep = @"AudioQueueAllocateBuffer";
    status = AudioQueueAllocateBuffer(_pcmLiveAudioQueue, bufferByteSize, &_pcmLiveAQBuffers[i]);
    if (status == noErr) {
      // A buffer that is not enqueued is never filled, so treat enqueue failure as fatal too.
      failedStep = @"AudioQueueEnqueueBuffer";
      status = AudioQueueEnqueueBuffer(_pcmLiveAudioQueue, _pcmLiveAQBuffers[i], 0, NULL);
    }
    if (status != noErr) {
      pcmLiveStopQueue();
      [session setActive:NO withOptions:0 error:nil];
      reject(@"PCM_LIVE_STREAM_ERROR", [NSString stringWithFormat:@"%@ failed: %d", failedStep, (int)status], nil);
      return;
    }
  }

  // Set the flag before starting so the first callbacks are not dropped.
  _pcmLiveAQRunning = YES;
  status = AudioQueueStart(_pcmLiveAudioQueue, NULL);
  if (status != noErr) {
    pcmLiveStopQueue();
    [session setActive:NO withOptions:0 error:nil];
    reject(@"PCM_LIVE_STREAM_ERROR", [NSString stringWithFormat:@"AudioQueueStart failed: %d", (int)status], nil);
    return;
  }

  resolve(nil);
}

/** Stops live PCM capture; forwards to stopPcmLiveStreamWithResolve:reject:. */
- (void)stopPcmLiveStream:(RCTPromiseResolveBlock)resolve
                   reject:(RCTPromiseRejectBlock)reject
{
  [self stopPcmLiveStreamWithResolve:resolve reject:reject];
}

/**
 * Tears down the audio queue, deactivates the audio session (notifying other
 * audio clients) and resolves with nil. Never rejects.
 */
- (void)stopPcmLiveStreamWithResolve:(RCTPromiseResolveBlock)resolve
                              reject:(RCTPromiseRejectBlock)reject
{
  pcmLiveStopQueue();
  [[AVAudioSession sharedInstance] setActive:NO withOptions:AVAudioSessionSetActiveOptionNotifyOthersOnDeactivation error:nil];
  resolve(nil);
}

@end
|
package/ios/SherpaOnnx+STT.mm
CHANGED
|
@@ -38,6 +38,7 @@ static NSString *sttModelKindToNSString(sherpaonnx::SttModelKind kind) {
|
|
|
38
38
|
case K::kFunAsrNano: return @"funasr_nano";
|
|
39
39
|
case K::kFireRedAsr: return @"fire_red_asr";
|
|
40
40
|
case K::kMoonshine: return @"moonshine";
|
|
41
|
+
case K::kMoonshineV2: return @"moonshine_v2";
|
|
41
42
|
case K::kDolphin: return @"dolphin";
|
|
42
43
|
case K::kCanary: return @"canary";
|
|
43
44
|
case K::kOmnilingual: return @"omnilingual";
|
|
@@ -268,6 +269,7 @@ static NSDictionary *sttResultToDict(const sherpaonnx::SttRecognitionResult& r)
|
|
|
268
269
|
|
|
269
270
|
NSMutableDictionary *resultDict = [NSMutableDictionary dictionary];
|
|
270
271
|
resultDict[@"success"] = @(result.ok);
|
|
272
|
+
resultDict[@"isHardwareSpecificUnsupported"] = @(result.isHardwareSpecificUnsupported);
|
|
271
273
|
if (!result.error.empty()) {
|
|
272
274
|
resultDict[@"error"] = [NSString stringWithUTF8String:result.error.c_str()];
|
|
273
275
|
}
|
package/ios/SherpaOnnx.mm
CHANGED
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
|
|
37
37
|
- (NSArray<NSString *> *)supportedEvents
|
|
38
38
|
{
|
|
39
|
-
return @[ @"ttsStreamChunk", @"ttsStreamEnd", @"ttsStreamError", @"extractTarBz2Progress" ];
|
|
39
|
+
return @[ @"ttsStreamChunk", @"ttsStreamEnd", @"ttsStreamError", @"extractTarBz2Progress", @"pcmLiveStreamData", @"pcmLiveStreamError" ];
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
- (void)resolveModelPath:(JS::NativeSherpaOnnx::SpecResolveModelPathConfig &)config
|
|
@@ -21,12 +21,11 @@ std::vector<std::string> ListDirectories(const std::string& path);
|
|
|
21
21
|
std::vector<FileEntry> ListFiles(const std::string& path);
|
|
22
22
|
std::vector<FileEntry> ListFilesRecursive(const std::string& path, int maxDepth);
|
|
23
23
|
std::string ToLower(std::string value);
|
|
24
|
-
std::string ResolveTokenizerDir(const std::string& modelDir);
|
|
25
24
|
|
|
26
|
-
|
|
25
|
+
/** Find file in \p files whose name equals \p fileName (case-insensitive). Uses file tree only, no filesystem. */
|
|
26
|
+
std::string FindFileByName(const std::vector<FileEntry>& files, const std::string& fileName);
|
|
27
27
|
/** Find file whose name equals or ends with suffix (e.g. tokens.txt, tiny-tokens.txt) in a pre-built file list. */
|
|
28
28
|
std::string FindFileEndingWith(const std::vector<FileEntry>& files, const std::string& suffix);
|
|
29
|
-
std::string FindDirectoryByName(const std::string& baseDir, const std::string& dirName, int maxDepth);
|
|
30
29
|
|
|
31
30
|
std::string FindOnnxByToken(
|
|
32
31
|
const std::vector<FileEntry>& files,
|
|
@@ -38,6 +37,13 @@ std::string FindOnnxByAnyToken(
|
|
|
38
37
|
const std::vector<std::string>& tokens,
|
|
39
38
|
const std::optional<bool>& preferInt8
|
|
40
39
|
);
|
|
40
|
+
/** Like FindOnnxByAnyToken but skips any file whose nameLower contains any of \p excludeInName. */
|
|
41
|
+
std::string FindOnnxByAnyTokenExcluding(
|
|
42
|
+
const std::vector<FileEntry>& files,
|
|
43
|
+
const std::vector<std::string>& tokens,
|
|
44
|
+
const std::vector<std::string>& excludeInName,
|
|
45
|
+
const std::optional<bool>& preferInt8
|
|
46
|
+
);
|
|
41
47
|
std::string FindLargestOnnxExcludingTokens(
|
|
42
48
|
const std::vector<FileEntry>& files,
|
|
43
49
|
const std::vector<std::string>& excludeTokens
|