react-native-sherpa-onnx 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -7
- package/SherpaOnnx.podspec +1 -1
- package/android/build.gradle +35 -26
- package/android/prebuilt-download.gradle +27 -14
- package/android/src/main/cpp/CMakeLists.txt +51 -17
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +14 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +16 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +19 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +2 -1
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +1 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +114 -8
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxOnlineSttHelper.kt +535 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +10 -10
- package/ios/SherpaOnnx+OnlineSTT.mm +365 -0
- package/ios/SherpaOnnx+TTS.mm +35 -9
- package/ios/SherpaOnnx.mm +6 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +3 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +16 -0
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +19 -2
- package/ios/model_detect/sherpa-onnx-model-detect.h +2 -1
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.h +85 -0
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.mm +270 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/index.js +2 -2
- package/lib/module/stt/index.js +4 -0
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/streaming.js +257 -0
- package/lib/module/stt/streaming.js.map +1 -0
- package/lib/module/stt/streamingTypes.js +38 -0
- package/lib/module/stt/streamingTypes.js.map +1 -0
- package/lib/module/tts/index.js +4 -43
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/streaming.js +220 -0
- package/lib/module/tts/streaming.js.map +1 -0
- package/lib/module/tts/streamingTypes.js +4 -0
- package/lib/module/tts/streamingTypes.js.map +1 -0
- package/lib/module/tts/types.js +8 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +66 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/stt/index.d.ts +3 -0
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts +42 -0
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -0
- package/lib/typescript/src/stt/streamingTypes.d.ts +122 -0
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -0
- package/lib/typescript/src/tts/index.d.ts +3 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/streaming.d.ts +24 -0
- package/lib/typescript/src/tts/streaming.d.ts.map +1 -0
- package/lib/typescript/src/tts/streamingTypes.d.ts +27 -0
- package/lib/typescript/src/tts/streamingTypes.d.ts.map +1 -0
- package/lib/typescript/src/tts/types.d.ts +19 -6
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeSherpaOnnx.ts +95 -0
- package/src/index.tsx +2 -2
- package/src/stt/index.ts +17 -0
- package/src/stt/streaming.ts +361 -0
- package/src/stt/streamingTypes.ts +151 -0
- package/src/tts/index.ts +6 -66
- package/src/tts/streaming.ts +336 -0
- package/src/tts/streamingTypes.ts +54 -0
- package/src/tts/types.ts +20 -10
- package/android/codegen.gradle +0 -57
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import SherpaOnnx from '../NativeSherpaOnnx';
|
|
2
|
+
import { resolveModelPath } from '../utils';
|
|
3
|
+
import type {
|
|
4
|
+
OnlineSTTModelType,
|
|
5
|
+
StreamingSttEngine,
|
|
6
|
+
StreamingSttInitOptions,
|
|
7
|
+
StreamingSttResult,
|
|
8
|
+
SttStream,
|
|
9
|
+
} from './streamingTypes';
|
|
10
|
+
|
|
11
|
+
// Module-wide counter; createStreamingSTT() pre-increments it to build a unique
// engine id of the form "streaming_stt_<n>".
let streamingSttInstanceCounter = 0;
|
|
12
|
+
|
|
13
|
+
/**
 * Map a detected STT model type (from detectSttModel) to an online (streaming) model type.
 *
 * The generic CTC names ('zipformer_ctc', 'ctc') map to 'zipformer2_ctc'. A value that is
 * already an online type name — including 'zipformer2_ctc' — maps to itself, so the function
 * is idempotent and never rejects a type that the error message lists as supported.
 *
 * @param detectedType - Detected model type; `undefined` is treated as the empty string.
 * @returns The online model type to pass to the streaming recognizer.
 * @throws Error if the detected type has no streaming support.
 */
export function mapDetectedToOnlineType(
  detectedType: string | undefined
): OnlineSTTModelType {
  const t = detectedType ?? '';
  switch (t) {
    case 'transducer':
      return 'transducer';
    case 'paraformer':
      return 'paraformer';
    case 'nemo_ctc':
      return 'nemo_ctc';
    case 'zipformer_ctc':
    case 'ctc':
    // Already the online name: pass it through instead of reporting it as unsupported.
    case 'zipformer2_ctc':
      return 'zipformer2_ctc';
    case 'tone_ctc':
      return 'tone_ctc';
    default:
      throw new Error(
        `Model type "${t}" is not supported for streaming STT. Use createSTT() for offline recognition, or pass a supported modelType: transducer, paraformer, zipformer2_ctc, nemo_ctc, tone_ctc.`
      );
  }
}
|
|
39
|
+
|
|
40
|
+
/**
 * Non-throwing variant of mapDetectedToOnlineType().
 *
 * Handy for checking up front whether a detected model can be used with
 * createStreamingSTT() (e.g. for live transcription).
 *
 * @param detectedType - Detected STT model type, or `undefined`.
 * @returns The online (streaming) model type, or `null` when mapDetectedToOnlineType() throws.
 */
export function getOnlineTypeOrNull(
  detectedType: string | undefined
): OnlineSTTModelType | null {
  let onlineType: OnlineSTTModelType | null = null;
  try {
    onlineType = mapDetectedToOnlineType(detectedType);
  } catch {
    // The mapping failed: keep the null default.
  }
  return onlineType;
}
|
|
53
|
+
// Module-wide counter shared by all engines; createStream() pre-increments it to
// build a unique stream id of the form "stt_stream_<n>".
let sttStreamCounter = 0;
|
|
54
|
+
|
|
55
|
+
/**
 * Coerce a raw native recognition payload into a well-formed StreamingSttResult.
 *
 * Every field is validated independently: a non-string `text` becomes '', and a
 * non-array `tokens` / `timestamps` becomes []. A missing payload (`null` or
 * `undefined`) yields the same empty result instead of throwing a TypeError.
 * Array elements are passed through as-is; they are not validated individually.
 *
 * @param raw - Payload returned by the native module; may be partially populated or absent.
 * @returns A result whose three fields are always present.
 */
function normalizeStreamingResult(
  raw:
    | {
        text?: string;
        tokens?: string[] | unknown;
        timestamps?: number[] | unknown;
      }
    | null
    | undefined
): StreamingSttResult {
  const text = raw?.text;
  const tokens = raw?.tokens;
  const timestamps = raw?.timestamps;
  return {
    text: typeof text === 'string' ? text : '',
    tokens: Array.isArray(tokens) ? (tokens as string[]) : [],
    timestamps: Array.isArray(timestamps) ? (timestamps as number[]) : [],
  };
}
|
|
68
|
+
|
|
69
|
+
/**
 * Convert StreamingSttInitOptions into the flat parameter shape used for native
 * online STT initialization.
 *
 * - `enableEndpoint`, `decodingMethod` and `maxActivePaths` receive their defaults
 *   (true, 'greedy_search', 4) when omitted.
 * - Each of the three endpoint rules is expanded into three flat fields (9 in total);
 *   fields of a missing rule stay `undefined`.
 * - `modelDir` is returned as '' and must be filled in by the caller once the model
 *   path has been resolved.
 */
function flattenInitOptionsForNative(options: StreamingSttInitOptions): {
  modelDir: string;
  modelType: string;
  enableEndpoint: boolean;
  decodingMethod: string;
  maxActivePaths: number;
  hotwordsFile?: string;
  hotwordsScore?: number;
  numThreads?: number;
  provider?: string;
  ruleFsts?: string;
  ruleFars?: string;
  blankPenalty?: number;
  debug?: boolean;
  rule1MustContainNonSilence?: boolean;
  rule1MinTrailingSilence?: number;
  rule1MinUtteranceLength?: number;
  rule2MustContainNonSilence?: boolean;
  rule2MinTrailingSilence?: number;
  rule2MinUtteranceLength?: number;
  rule3MustContainNonSilence?: boolean;
  rule3MinTrailingSilence?: number;
  rule3MinUtteranceLength?: number;
} {
  const {
    modelType,
    enableEndpoint,
    decodingMethod,
    maxActivePaths,
    hotwordsFile,
    hotwordsScore,
    numThreads,
    provider,
    ruleFsts,
    ruleFars,
    blankPenalty,
    debug,
    endpointConfig,
  } = options;

  const first = endpointConfig?.rule1;
  const second = endpointConfig?.rule2;
  const third = endpointConfig?.rule3;

  return {
    // Placeholder: the caller overwrites this after resolveModelPath.
    modelDir: '',
    modelType,
    enableEndpoint: enableEndpoint ?? true,
    decodingMethod: decodingMethod ?? 'greedy_search',
    maxActivePaths: maxActivePaths ?? 4,
    hotwordsFile,
    hotwordsScore,
    numThreads,
    provider,
    ruleFsts,
    ruleFars,
    blankPenalty,
    debug,
    rule1MustContainNonSilence: first?.mustContainNonSilence,
    rule1MinTrailingSilence: first?.minTrailingSilence,
    rule1MinUtteranceLength: first?.minUtteranceLength,
    rule2MustContainNonSilence: second?.mustContainNonSilence,
    rule2MinTrailingSilence: second?.minTrailingSilence,
    rule2MinUtteranceLength: second?.minUtteranceLength,
    rule3MustContainNonSilence: third?.mustContainNonSilence,
    rule3MinTrailingSilence: third?.minTrailingSilence,
    rule3MinUtteranceLength: third?.minUtteranceLength,
  };
}
|
|
123
|
+
|
|
124
|
+
/** Peak amplitude that adaptive input normalization scales each chunk towards. */
const STT_INPUT_TARGET_PEAK = 0.8;

/** Largest gain adaptive input normalization applies to a quiet chunk. */
const STT_INPUT_MAX_GAIN = 80;

/** Floor for the measured peak so the gain stays finite for an all-zero chunk. */
const STT_INPUT_MIN_PEAK = 1e-10;

/**
 * Scale one chunk so that its peak approaches STT_INPUT_TARGET_PEAK, with the gain
 * capped at STT_INPUT_MAX_GAIN, then clamp every sample to [-1, 1].
 * The gain is computed per chunk. Returns a new array; the input is not modified.
 */
function normalizeSttInputSamples(samples: readonly number[]): number[] {
  let peak = STT_INPUT_MIN_PEAK;
  for (let i = 0; i < samples.length; i++) {
    const magnitude = Math.abs(samples[i]!);
    if (magnitude > peak) peak = magnitude;
  }
  // Chunks whose peak is below TARGET_PEAK / MAX_GAIN (0.01) simply hit the gain cap.
  const gain = Math.min(STT_INPUT_MAX_GAIN, STT_INPUT_TARGET_PEAK / peak);
  const scaled = new Array<number>(samples.length);
  for (let i = 0; i < samples.length; i++) {
    const value = samples[i]! * gain;
    scaled[i] = value < -1 ? -1 : value > 1 ? 1 : value;
  }
  return scaled;
}

/**
 * Shallow-copy `value`, dropping every own enumerable property whose value is `undefined`.
 * Only safe when all required properties of `T` are actually defined at runtime.
 */
function withoutUndefinedValues<T extends object>(value: T): T {
  const defined: Record<string, unknown> = {};
  for (const [key, entry] of Object.entries(value)) {
    if (entry !== undefined) defined[key] = entry;
  }
  return defined as T;
}

/**
 * Create a streaming (online) STT engine. Use this for real-time recognition with
 * partial results and endpoint detection. Call destroy() when done.
 *
 * @param options - Streaming STT init options. `modelPath` is required. `modelType` is either
 *   an explicit online type or 'auto'; 'auto' (and a missing value, e.g. from untyped JS
 *   callers) runs detectSttModel on the resolved directory and maps the detected type to an
 *   online type.
 * @returns Promise resolving to a StreamingSttEngine
 * @throws Error if auto-detection fails, if the detected type has no streaming support,
 *   or if native initialization reports failure.
 * @example
 * ```typescript
 * // With explicit model type
 * const engine = await createStreamingSTT({
 *   modelPath: { type: 'asset', path: 'models/streaming-zipformer-en' },
 *   modelType: 'transducer',
 * });
 * // With auto-detection
 * const engine = await createStreamingSTT({
 *   modelPath: { type: 'asset', path: 'models/sherpa-onnx-streaming-t-one-russian-2025-09-08' },
 *   modelType: 'auto',
 * });
 * const stream = await engine.createStream();
 * await stream.acceptWaveform(samples, 16000);
 * if (await stream.isReady()) {
 *   await stream.decode();
 *   const result = await stream.getResult();
 *   console.log(result.text);
 * }
 * await stream.release();
 * await engine.destroy();
 * ```
 */
export async function createStreamingSTT(
  options: StreamingSttInitOptions
): Promise<StreamingSttEngine> {
  const instanceId = `streaming_stt_${++streamingSttInstanceCounter}`;
  const resolvedPath = await resolveModelPath(options.modelPath);

  let effectiveModelType: OnlineSTTModelType;
  if (options.modelType === 'auto' || options.modelType === undefined) {
    const detectResult = await SherpaOnnx.detectSttModel(
      resolvedPath,
      undefined,
      undefined
    );
    if (!detectResult.success) {
      const errMsg =
        'error' in detectResult &&
        typeof (detectResult as { error?: string }).error === 'string'
          ? (detectResult as { error: string }).error
          : 'Unknown error';
      throw new Error(
        `Streaming STT auto-detection failed for ${resolvedPath}. ${errMsg}`
      );
    }
    effectiveModelType = mapDetectedToOnlineType(detectResult.modelType);
  } else {
    effectiveModelType = options.modelType;
  }

  const flat = {
    ...flattenInitOptionsForNative({
      ...options,
      modelType: effectiveModelType,
    }),
    modelDir: resolvedPath,
  };

  // Pass only defined values: options containing undefined crash iOS TurboModule marshalling.
  const nativeOptions: Parameters<
    typeof SherpaOnnx.initializeOnlineSttWithOptions
  >[1] = withoutUndefinedValues(flat);

  const result = await SherpaOnnx.initializeOnlineSttWithOptions(
    instanceId,
    nativeOptions
  );

  if (!result.success) {
    throw new Error(`Streaming STT initialization failed for ${instanceId}`);
  }

  // Normalization is on unless the caller explicitly passes false.
  const enableInputNormalization = options.enableInputNormalization !== false;
  let destroyed = false;

  const guard = () => {
    if (destroyed) {
      throw new Error(
        `Streaming STT engine ${instanceId} has been destroyed; cannot call methods on it.`
      );
    }
  };

  const engine: StreamingSttEngine = {
    get instanceId() {
      return instanceId;
    },

    async createStream(hotwords?: string): Promise<SttStream> {
      guard();
      const streamId = `stt_stream_${++sttStreamCounter}`;
      await SherpaOnnx.createSttStream(instanceId, streamId, hotwords);

      let released = false;
      // Rejects calls made after the owning engine was destroyed or this stream was released.
      const streamGuard = () => {
        if (destroyed) {
          throw new Error(
            `Streaming STT engine ${instanceId} has been destroyed.`
          );
        }
        if (released) {
          throw new Error(
            `Stream ${streamId} has been released; cannot call methods on it.`
          );
        }
      };

      const stream: SttStream = {
        get streamId() {
          return streamId;
        },

        async acceptWaveform(
          samples: number[],
          sampleRate: number
        ): Promise<void> {
          streamGuard();
          await SherpaOnnx.acceptSttWaveform(streamId, samples, sampleRate);
        },

        async inputFinished(): Promise<void> {
          streamGuard();
          await SherpaOnnx.sttStreamInputFinished(streamId);
        },

        async decode(): Promise<void> {
          streamGuard();
          await SherpaOnnx.decodeSttStream(streamId);
        },

        async isReady(): Promise<boolean> {
          streamGuard();
          return SherpaOnnx.isSttStreamReady(streamId);
        },

        async getResult(): Promise<StreamingSttResult> {
          streamGuard();
          const raw = await SherpaOnnx.getSttStreamResult(streamId);
          return normalizeStreamingResult(raw);
        },

        async isEndpoint(): Promise<boolean> {
          streamGuard();
          return SherpaOnnx.isSttStreamEndpoint(streamId);
        },

        async reset(): Promise<void> {
          streamGuard();
          await SherpaOnnx.resetSttStream(streamId);
        },

        async release(): Promise<void> {
          // Idempotent; the flag is set before awaiting so concurrent calls release only once.
          if (released) return;
          released = true;
          await SherpaOnnx.releaseSttStream(streamId);
        },

        async processAudioChunk(
          samples: number[],
          sampleRate: number
        ): Promise<{ result: StreamingSttResult; isEndpoint: boolean }> {
          streamGuard();
          // Empty chunks and disabled normalization pass the caller's array through untouched.
          const toSend =
            enableInputNormalization && samples.length > 0
              ? normalizeSttInputSamples(samples)
              : samples;
          const raw = await SherpaOnnx.processSttAudioChunk(
            streamId,
            toSend,
            sampleRate
          );
          return {
            result: normalizeStreamingResult(raw),
            isEndpoint: Boolean(raw.isEndpoint),
          };
        },
      };

      return stream;
    },

    async destroy(): Promise<void> {
      // Idempotent; the flag is set before awaiting so concurrent calls unload only once.
      if (destroyed) return;
      destroyed = true;
      await SherpaOnnx.unloadOnlineStt(instanceId);
    },
  };

  return engine;
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import type { ModelPathConfig } from '../types';
|
|
2
|
+
|
|
3
|
+
/**
 * Runtime list of supported online (streaming) STT model types.
 * These models use OnlineRecognizer + OnlineStream in sherpa-onnx.
 * Must match the native OnlineRecognizer model config (transducer, paraformer, zipformer2_ctc, nemo_ctc, tone_ctc).
 *
 * This tuple is the single source of truth: OnlineSTTModelType is derived from it,
 * so the runtime list and the compile-time union cannot drift apart.
 */
export const ONLINE_STT_MODEL_TYPES = [
  'transducer',
  'paraformer',
  'zipformer2_ctc',
  'nemo_ctc',
  'tone_ctc',
] as const;

/**
 * Online (streaming) STT model types: the union of the entries of ONLINE_STT_MODEL_TYPES.
 */
export type OnlineSTTModelType = (typeof ONLINE_STT_MODEL_TYPES)[number];
|
|
23
|
+
|
|
24
|
+
/**
 * Single endpoint rule (Kotlin EndpointRule).
 * Used to detect end of utterance in streaming recognition.
 * All three fields are required whenever a rule is supplied.
 */
export interface EndpointRule {
  /** If true, rule only matches when the segment contains non-silence. */
  mustContainNonSilence: boolean;
  /** Minimum trailing silence in seconds. */
  minTrailingSilence: number;
  /**
   * Minimum utterance length in seconds. EndpointConfig describes rule3 as a
   * maximum-utterance-length cap (e.g. 20s), which is expressed through this field.
   */
  minUtteranceLength: number;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Endpoint detection config (Kotlin EndpointConfig).
 * Three rules; first match determines end of utterance.
 * Every rule is optional; an omitted rule is simply not passed on by this package.
 * NOTE(review): presumably the native side then applies its own default for that rule — confirm.
 */
export interface EndpointConfig {
  /** Rule 1: e.g. 2.4s trailing silence, no speech required. */
  rule1?: EndpointRule;
  /** Rule 2: e.g. 1.4s trailing silence, speech required. */
  rule2?: EndpointRule;
  /** Rule 3: e.g. max utterance length 20s. */
  rule3?: EndpointRule;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Options for initializing the streaming (online) STT engine via createStreamingSTT().
 * Optional fields left undefined are omitted from the options object sent to the native module.
 */
export interface StreamingSttInitOptions {
  /** Model path configuration (asset, file, or auto). */
  modelPath: ModelPathConfig;
  /**
   * Online model type. Use 'auto' to detect from model directory (calls detectSttModel and maps to an online type).
   * createStreamingSTT() throws if detection fails or the detected type has no streaming support.
   */
  modelType: OnlineSTTModelType | 'auto';
  /** Enable endpoint detection. Default: true. */
  enableEndpoint?: boolean;
  /** Endpoint rules. Defaults match Kotlin (rule1: 2.4s silence, rule2: 1.4s + speech, rule3: 20s max). */
  endpointConfig?: EndpointConfig;
  /** Decoding method. Default: "greedy_search". */
  decodingMethod?: 'greedy_search' | 'modified_beam_search';
  /** Max active paths for beam search. Default: 4. */
  maxActivePaths?: number;
  /** Path to hotwords file (transducer/nemo_transducer). */
  hotwordsFile?: string;
  /** Hotwords score. Default: 1.5. NOTE(review): this default is not applied in JS — presumably native; confirm. */
  hotwordsScore?: number;
  /** Number of threads for inference. Default: 1. NOTE(review): this default is not applied in JS — presumably native; confirm. */
  numThreads?: number;
  /** Execution provider (e.g. "cpu"). */
  provider?: string;
  /** Path(s) to rule FSTs for ITN. */
  ruleFsts?: string;
  /** Path(s) to rule FARs for ITN. */
  ruleFars?: string;
  /** Blank penalty. */
  blankPenalty?: number;
  /** Enable debug logging. Default: false. */
  debug?: boolean;
  /**
   * Enable adaptive input normalization for audio chunks in processAudioChunk().
   * When true (default), input is scaled so the peak is ~0.8 to handle varying device levels (e.g. quiet mics on iOS).
   * The gain is computed per chunk, capped at 80x, and the scaled samples are clamped to [-1, 1].
   * Only an explicit `false` disables it; acceptWaveform() never normalizes.
   * Set to false if your audio is already in the expected range [-1, 1] and you want to pass it through unchanged.
   */
  enableInputNormalization?: boolean;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Partial or final recognition result from streaming STT (maps to Kotlin OnlineRecognizerResult).
 * Results produced through createStreamingSTT() always carry all three fields.
 */
export interface StreamingSttResult {
  /** Recognized text; '' when the native payload has no string text. */
  text: string;
  /** Recognized tokens; [] when the native payload has no token array. */
  tokens: string[];
  /**
   * Timestamps; [] when the native payload has no timestamp array.
   * NOTE(review): presumably one entry per token, in seconds — confirm against the native result.
   */
  timestamps: number[];
}
|
|
98
|
+
|
|
99
|
+
/**
 * Streaming STT stream. Created by StreamingSttEngine.createStream().
 * Feeds audio via acceptWaveform, then decode / getResult / isEndpoint.
 *
 * In the implementation returned by createStreamingSTT(), every method except
 * release() rejects once the stream has been released or its engine destroyed.
 */
export interface SttStream {
  /** Unique id of this stream (format "stt_stream_<n>" for streams made by createStreamingSTT()). */
  readonly streamId: string;

  /** Feed PCM samples (float in [-1, 1]) to the stream. Samples are forwarded unchanged (no input normalization). */
  acceptWaveform(samples: number[], sampleRate: number): Promise<void>;

  /** Signal that no more audio will be fed. */
  inputFinished(): Promise<void>;

  /** Run decoding on accumulated audio (call when isReady() is true). */
  decode(): Promise<void>;

  /** True if there is enough audio to decode. */
  isReady(): Promise<boolean>;

  /** Get current partial or final result. Call after decode(). */
  getResult(): Promise<StreamingSttResult>;

  /** True if endpoint (end of utterance) was detected. */
  isEndpoint(): Promise<boolean>;

  /** Reset stream state for reuse. */
  reset(): Promise<void>;

  /** Release native stream; do not use after this. Calling it again is a no-op. */
  release(): Promise<void>;

  /**
   * Convenience: feed audio, auto-decode while ready, return result and endpoint status.
   * Reduces bridge round-trips from 5 to 1 per chunk.
   * Unlike acceptWaveform(), non-empty chunks are first passed through adaptive input
   * normalization unless the engine was created with enableInputNormalization: false.
   */
  processAudioChunk(
    samples: number[],
    sampleRate: number
  ): Promise<{ result: StreamingSttResult; isEndpoint: boolean }>;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Streaming STT engine (OnlineRecognizer). Create via createStreamingSTT().
 * After destroy(), createStream() and the methods of existing streams reject.
 */
export interface StreamingSttEngine {
  /** Unique id of this engine (format "streaming_stt_<n>" for engines made by createStreamingSTT()). */
  readonly instanceId: string;

  /** Create a new stream for this recognizer. Optional hotwords string. */
  createStream(hotwords?: string): Promise<SttStream>;

  /**
   * Release native recognizer and all streams. Calling it again is a no-op.
   * NOTE(review): the JS side only unloads the recognizer; that the native unload also
   * frees still-open streams is assumed here — confirm against the native implementation.
   */
  destroy(): Promise<void>;
}
|
package/src/tts/index.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { DeviceEventEmitter } from 'react-native';
|
|
2
1
|
import SherpaOnnx from '../NativeSherpaOnnx';
|
|
3
2
|
import type {
|
|
4
3
|
TTSInitializeOptions,
|
|
@@ -10,10 +9,6 @@ import type {
|
|
|
10
9
|
GeneratedAudioWithTimestamps,
|
|
11
10
|
TTSModelInfo,
|
|
12
11
|
TtsEngine,
|
|
13
|
-
TtsStreamChunk,
|
|
14
|
-
TtsStreamEnd,
|
|
15
|
-
TtsStreamError,
|
|
16
|
-
TtsStreamHandlers,
|
|
17
12
|
} from './types';
|
|
18
13
|
import type { ModelPathConfig } from '../types';
|
|
19
14
|
import { resolveModelPath } from '../utils';
|
|
@@ -135,7 +130,7 @@ function toNativeTtsOptions(
|
|
|
135
130
|
}
|
|
136
131
|
|
|
137
132
|
// TTS stream events are sent from native via sendEventWithName; use DeviceEventEmitter
|
|
138
|
-
|
|
133
|
+
|
|
139
134
|
/**
|
|
140
135
|
* Create a TTS engine instance. Call destroy() on the returned engine when done to free native resources.
|
|
141
136
|
*
|
|
@@ -260,66 +255,6 @@ export async function createTTS(
|
|
|
260
255
|
);
|
|
261
256
|
},
|
|
262
257
|
|
|
263
|
-
async generateSpeechStream(
|
|
264
|
-
text: string,
|
|
265
|
-
opts: TtsGenerationOptions | undefined,
|
|
266
|
-
handlers: TtsStreamHandlers
|
|
267
|
-
): Promise<() => void> {
|
|
268
|
-
guard();
|
|
269
|
-
const subscriptions = [
|
|
270
|
-
DeviceEventEmitter.addListener('ttsStreamChunk', (event: unknown) => {
|
|
271
|
-
const e = event as TtsStreamChunk;
|
|
272
|
-
if (e.instanceId != null && e.instanceId !== instanceId) return;
|
|
273
|
-
handlers.onChunk?.(e);
|
|
274
|
-
}),
|
|
275
|
-
DeviceEventEmitter.addListener('ttsStreamEnd', (event: unknown) => {
|
|
276
|
-
const e = event as TtsStreamEnd;
|
|
277
|
-
if (e.instanceId != null && e.instanceId !== instanceId) return;
|
|
278
|
-
handlers.onEnd?.(e);
|
|
279
|
-
}),
|
|
280
|
-
DeviceEventEmitter.addListener('ttsStreamError', (event: unknown) => {
|
|
281
|
-
const e = event as TtsStreamError;
|
|
282
|
-
if (e.instanceId != null && e.instanceId !== instanceId) return;
|
|
283
|
-
handlers.onError?.(e);
|
|
284
|
-
}),
|
|
285
|
-
];
|
|
286
|
-
|
|
287
|
-
try {
|
|
288
|
-
await SherpaOnnx.generateTtsStream(
|
|
289
|
-
instanceId,
|
|
290
|
-
text,
|
|
291
|
-
toNativeTtsOptions(opts)
|
|
292
|
-
);
|
|
293
|
-
} catch (error) {
|
|
294
|
-
subscriptions.forEach((sub) => sub.remove());
|
|
295
|
-
throw error;
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
return () => {
|
|
299
|
-
subscriptions.forEach((sub) => sub.remove());
|
|
300
|
-
};
|
|
301
|
-
},
|
|
302
|
-
|
|
303
|
-
async cancelSpeechStream(): Promise<void> {
|
|
304
|
-
guard();
|
|
305
|
-
return SherpaOnnx.cancelTtsStream(instanceId);
|
|
306
|
-
},
|
|
307
|
-
|
|
308
|
-
async startPcmPlayer(sampleRate: number, channels: number): Promise<void> {
|
|
309
|
-
guard();
|
|
310
|
-
return SherpaOnnx.startTtsPcmPlayer(instanceId, sampleRate, channels);
|
|
311
|
-
},
|
|
312
|
-
|
|
313
|
-
async writePcmChunk(samples: number[]): Promise<void> {
|
|
314
|
-
guard();
|
|
315
|
-
return SherpaOnnx.writeTtsPcmChunk(instanceId, samples);
|
|
316
|
-
},
|
|
317
|
-
|
|
318
|
-
async stopPcmPlayer(): Promise<void> {
|
|
319
|
-
guard();
|
|
320
|
-
return SherpaOnnx.stopTtsPcmPlayer(instanceId);
|
|
321
|
-
},
|
|
322
|
-
|
|
323
258
|
async updateParams(opts: TtsUpdateOptions): Promise<{
|
|
324
259
|
success: boolean;
|
|
325
260
|
detectedModels: Array<{ type: string; modelDir: string }>;
|
|
@@ -445,6 +380,10 @@ export function shareAudioFile(
|
|
|
445
380
|
return SherpaOnnx.shareTtsAudio(fileUri, mimeType);
|
|
446
381
|
}
|
|
447
382
|
|
|
383
|
+
// Streaming TTS (separate engine; use createStreamingTTS for chunk callbacks and PCM playback)
|
|
384
|
+
export { createStreamingTTS } from './streaming';
|
|
385
|
+
export type { StreamingTtsEngine } from './streamingTypes';
|
|
386
|
+
|
|
448
387
|
// Export types and runtime type list
|
|
449
388
|
export type {
|
|
450
389
|
TTSInitializeOptions,
|
|
@@ -462,6 +401,7 @@ export type {
|
|
|
462
401
|
TtsSubtitleItem,
|
|
463
402
|
TTSModelInfo,
|
|
464
403
|
TtsEngine,
|
|
404
|
+
TtsStreamController,
|
|
465
405
|
TtsStreamHandlers,
|
|
466
406
|
TtsStreamChunk,
|
|
467
407
|
TtsStreamEnd,
|