react-native-sherpa-onnx 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +232 -236
- package/SherpaOnnx.podspec +68 -64
- package/android/build.gradle +182 -192
- package/android/codegen.gradle +57 -0
- package/android/prebuilt-download.gradle +428 -0
- package/android/prebuilt-versions.gradle +43 -0
- package/android/proguard-rules.pro +10 -0
- package/android/src/main/assets/testModels/add_mul_add.onnx +28 -0
- package/android/src/main/assets/testModels/nnapi_internal_uint8_support.onnx +0 -0
- package/android/src/main/assets/testModels/qnn_multi_ctx_embed.onnx +0 -0
- package/android/src/main/cpp/CMakeLists.txt +166 -129
- package/android/src/main/cpp/CMakePresets.json +54 -0
- package/android/src/main/cpp/crypto/sha256.cpp +174 -0
- package/android/src/main/cpp/crypto/sha256.h +16 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +404 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +56 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +181 -0
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +888 -0
- package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-common.h +18 -18
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +86 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +20 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +423 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +55 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +399 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +238 -0
- package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-model-detect.h +122 -89
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +99 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.h +16 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +78 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.h +16 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +190 -0
- package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +301 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +94 -0
- package/android/src/main/java/com/sherpaonnx/{SherpaOnnxCoreHelper.kt → SherpaOnnxAssetHelper.kt} +350 -236
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +791 -483
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +699 -109
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +1123 -668
- package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +187 -0
- package/ios/SherpaOnnx+Assets.h +11 -0
- package/ios/SherpaOnnx+Assets.mm +325 -0
- package/ios/SherpaOnnx+STT.mm +455 -118
- package/ios/SherpaOnnx+TTS.mm +1101 -712
- package/ios/SherpaOnnx.h +17 -6
- package/ios/SherpaOnnx.mm +206 -311
- package/ios/SherpaOnnx.xcconfig +19 -19
- package/ios/SherpaOnnxCoreMLHelper.swift +24 -0
- package/ios/archive/sherpa-onnx-archive-helper.h +21 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +296 -0
- package/ios/libarchive_darwin_config.h +153 -0
- package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-common.h +18 -18
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +49 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +210 -0
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +344 -0
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +201 -0
- package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-model-detect.h +117 -89
- package/ios/scripts/patch-libarchive-includes.sh +61 -0
- package/ios/scripts/setup-ios-libarchive.sh +98 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +129 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +523 -0
- package/ios/{sherpa-onnx-tts-wrapper.h → tts/sherpa-onnx-tts-wrapper.h} +90 -85
- package/ios/{sherpa-onnx-tts-wrapper.mm → tts/sherpa-onnx-tts-wrapper.mm} +376 -345
- package/lib/module/NativeSherpaOnnx.js +3 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +22 -0
- package/lib/module/audio/index.js.map +1 -0
- package/lib/module/diarization/index.js +1 -1
- package/lib/module/diarization/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +918 -0
- package/lib/module/download/ModelDownloadManager.js.map +1 -0
- package/lib/module/download/extractTarBz2.js +53 -0
- package/lib/module/download/extractTarBz2.js.map +1 -0
- package/lib/module/download/index.js +6 -0
- package/lib/module/download/index.js.map +1 -0
- package/lib/module/download/validation.js +178 -0
- package/lib/module/download/validation.js.map +1 -0
- package/lib/module/enhancement/index.js +1 -1
- package/lib/module/enhancement/index.js.map +1 -1
- package/lib/module/index.js +41 -3
- package/lib/module/index.js.map +1 -1
- package/lib/module/separation/index.js +1 -1
- package/lib/module/separation/index.js.map +1 -1
- package/lib/module/stt/index.js +127 -60
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/sttModelLanguages.js +512 -0
- package/lib/module/stt/sttModelLanguages.js.map +1 -0
- package/lib/module/stt/types.js +53 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +216 -289
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/types.js +86 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/module/types.js.map +1 -1
- package/lib/module/utils.js +86 -73
- package/lib/module/utils.js.map +1 -1
- package/lib/module/vad/index.js +1 -1
- package/lib/module/vad/index.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +192 -38
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +13 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -0
- package/lib/typescript/src/diarization/index.d.ts +3 -2
- package/lib/typescript/src/diarization/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +108 -0
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -0
- package/lib/typescript/src/download/extractTarBz2.d.ts +14 -0
- package/lib/typescript/src/download/extractTarBz2.d.ts.map +1 -0
- package/lib/typescript/src/download/index.d.ts +7 -0
- package/lib/typescript/src/download/index.d.ts.map +1 -0
- package/lib/typescript/src/download/validation.d.ts +57 -0
- package/lib/typescript/src/download/validation.d.ts.map +1 -0
- package/lib/typescript/src/enhancement/index.d.ts +3 -2
- package/lib/typescript/src/enhancement/index.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +26 -2
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/separation/index.d.ts +3 -2
- package/lib/typescript/src/separation/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/index.d.ts +31 -43
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/sttModelLanguages.d.ts +52 -0
- package/lib/typescript/src/stt/sttModelLanguages.d.ts.map +1 -0
- package/lib/typescript/src/stt/types.d.ts +196 -9
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +25 -211
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +148 -25
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/lib/typescript/src/types.d.ts +0 -32
- package/lib/typescript/src/types.d.ts.map +1 -1
- package/lib/typescript/src/utils.d.ts +28 -13
- package/lib/typescript/src/utils.d.ts.map +1 -1
- package/lib/typescript/src/vad/index.d.ts +3 -2
- package/lib/typescript/src/vad/index.d.ts.map +1 -1
- package/package.json +250 -222
- package/scripts/check-qnn-support.sh +78 -0
- package/scripts/setup-ios-framework.sh +379 -282
- package/src/NativeSherpaOnnx.ts +474 -251
- package/src/audio/index.ts +32 -0
- package/src/diarization/index.ts +4 -2
- package/src/download/ModelDownloadManager.ts +1325 -0
- package/src/download/extractTarBz2.ts +78 -0
- package/src/download/index.ts +43 -0
- package/src/download/validation.ts +279 -0
- package/src/enhancement/index.ts +4 -2
- package/src/index.tsx +78 -27
- package/src/separation/index.ts +4 -2
- package/src/stt/index.ts +249 -89
- package/src/stt/sttModelLanguages.ts +237 -0
- package/src/stt/types.ts +263 -9
- package/src/tts/index.ts +470 -458
- package/src/tts/types.ts +373 -218
- package/src/types.ts +0 -44
- package/src/utils.ts +145 -131
- package/src/vad/index.ts +4 -2
- package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -0
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -0
- package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +0 -1918
- package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +0 -841
- package/android/src/main/cpp/jni/sherpa-onnx-model-detect.cpp +0 -541
- package/android/src/main/cpp/jni/sherpa-onnx-stt-jni.cpp +0 -336
- package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.cpp +0 -222
- package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.h +0 -68
- package/android/src/main/cpp/jni/sherpa-onnx-tts-jni.cpp +0 -823
- package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.cpp +0 -387
- package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.h +0 -147
- package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
- package/ios/include/sherpa-onnx/c-api/c-api.h +0 -1918
- package/ios/include/sherpa-onnx/c-api/cxx-api.h +0 -841
- package/ios/sherpa-onnx-model-detect.mm +0 -441
- package/ios/sherpa-onnx-stt-wrapper.h +0 -48
- package/ios/sherpa-onnx-stt-wrapper.mm +0 -201
- package/scripts/copy-headers.js +0 -184
- package/scripts/setup-assets.js +0 -323
package/src/tts/types.ts
CHANGED
|
@@ -1,218 +1,373 @@
|
|
|
1
|
-
import type { ModelPathConfig } from '../types';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Supported TTS model types.
|
|
5
|
-
*
|
|
6
|
-
* - 'vits': VITS models (includes Piper, Coqui, MeloTTS, MMS variants)
|
|
7
|
-
* - 'matcha': Matcha models (acoustic model + vocoder)
|
|
8
|
-
* - 'kokoro': Kokoro models (multi-speaker, multi-language)
|
|
9
|
-
* - 'kitten': KittenTTS models (lightweight, multi-speaker)
|
|
10
|
-
* - '
|
|
11
|
-
* - '
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
| '
|
|
16
|
-
| '
|
|
17
|
-
| '
|
|
18
|
-
| '
|
|
19
|
-
| '
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
*/
|
|
24
|
-
export
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
/**
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
/**
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
/**
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
*
|
|
104
|
-
*
|
|
105
|
-
*
|
|
106
|
-
*
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
*
|
|
113
|
-
*
|
|
114
|
-
*
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
/**
|
|
132
|
-
*
|
|
133
|
-
*
|
|
134
|
-
*/
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
/**
|
|
138
|
-
*
|
|
139
|
-
*
|
|
140
|
-
*/
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
*
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
1
|
+
import type { ModelPathConfig } from '../types';
|
|
2
|
+
|
|
3
|
+
/**
 * Runtime list of every supported TTS model type.
 *
 * This tuple is the single source of truth: the `TTSModelType` union is
 * derived from it, so the runtime list and the compile-time type cannot
 * drift apart when a model type is added or removed.
 *
 * - 'vits': VITS models (includes Piper, Coqui, MeloTTS, MMS variants)
 * - 'matcha': Matcha models (acoustic model + vocoder)
 * - 'kokoro': Kokoro models (multi-speaker, multi-language)
 * - 'kitten': KittenTTS models (lightweight, multi-speaker)
 * - 'pocket': Pocket TTS models
 * - 'zipvoice': Zipvoice models (voice cloning capable)
 * - 'auto': Auto-detect model type based on files present (default)
 */
export const TTS_MODEL_TYPES = [
  'vits',
  'matcha',
  'kokoro',
  'kitten',
  'pocket',
  'zipvoice',
  'auto',
] as const;

/**
 * Supported TTS model types: exactly the entries of `TTS_MODEL_TYPES`.
 */
export type TTSModelType = (typeof TTS_MODEL_TYPES)[number];
|
|
33
|
+
|
|
34
|
+
// ========== Model-specific options (only applied when that model type is loaded) ==========
|
|
35
|
+
|
|
36
|
+
/**
 * VITS-specific options; only applied when modelType is 'vits'.
 * Corresponds to Kotlin OfflineTtsVitsModelConfig.
 */
export interface TtsVitsModelOptions {
  /**
   * Noise scale. When left out, the model default (or model.json) applies.
   */
  noiseScale?: number;

  /**
   * Noise scale W. When left out, the model default applies.
   */
  noiseScaleW?: number;

  /**
   * Length scale. When left out, the model default applies.
   */
  lengthScale?: number;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Matcha-specific options; only applied when modelType is 'matcha'.
 * Corresponds to Kotlin OfflineTtsMatchaModelConfig.
 */
export interface TtsMatchaModelOptions {
  /**
   * Noise scale. When left out, the model default applies.
   */
  noiseScale?: number;

  /**
   * Length scale. When left out, the model default applies.
   */
  lengthScale?: number;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Kokoro-specific options; only applied when modelType is 'kokoro'.
 * Corresponds to Kotlin OfflineTtsKokoroModelConfig.
 */
export interface TtsKokoroModelOptions {
  /**
   * Length scale. When left out, the model default applies.
   */
  lengthScale?: number;
}
|
|
59
|
+
|
|
60
|
+
/**
 * KittenTTS-specific options; only applied when modelType is 'kitten'.
 * Corresponds to Kotlin OfflineTtsKittenModelConfig.
 */
export interface TtsKittenModelOptions {
  /**
   * Length scale. When left out, the model default applies.
   */
  lengthScale?: number;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Pocket TTS-specific options; only applied when modelType is 'pocket'.
 * Kotlin has no init-time model config for pocket, so this interface is
 * reserved for future use.
 */
export interface TtsPocketModelOptions {
  // Intentionally empty: Kotlin OfflineTtsPocketModelConfig has no init-time
  // options; voice cloning is configured via GenerationConfig at generate time.
}
|
|
70
|
+
|
|
71
|
+
/**
 * Container for per-model TTS options.
 *
 * Only the entry matching the model type that is actually loaded takes
 * effect; every other entry is ignored (for example, `vits` options do
 * nothing when a kokoro model is loaded).
 */
export interface TtsModelOptions {
  /** Options used when the loaded model type is 'vits'. */
  vits?: TtsVitsModelOptions;

  /** Options used when the loaded model type is 'matcha'. */
  matcha?: TtsMatchaModelOptions;

  /** Options used when the loaded model type is 'kokoro'. */
  kokoro?: TtsKokoroModelOptions;

  /** Options used when the loaded model type is 'kitten'. */
  kitten?: TtsKittenModelOptions;

  /** Options used when the loaded model type is 'pocket'. */
  pocket?: TtsPocketModelOptions;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Options accepted when initializing TTS.
 */
export interface TTSInitializeOptions {
  /**
   * Location of the model directory: an asset path, a file system path,
   * or an auto-detection path.
   */
  modelPath: ModelPathConfig;

  /**
   * Which model type to load. When omitted or set to 'auto', the type is
   * detected from the files found in the model directory.
   *
   * @default 'auto'
   */
  modelType?: TTSModelType;

  /**
   * Execution provider (e.g. `'cpu'`, `'coreml'`, `'xnnpack'`, `'nnapi'`, `'qnn'`).
   * Availability can be checked with getCoreMlSupport(), getXnnpackSupport(), etc.
   * See execution-providers.md.
   *
   * @default 'cpu'
   */
  provider?: string;

  /**
   * Thread count for inference. Additional threads speed up processing at
   * the cost of higher CPU usage.
   *
   * @default 2
   */
  numThreads?: number;

  /**
   * Turns on debug logging from the TTS engine.
   *
   * @default false
   */
  debug?: boolean;

  /**
   * Per-model options. Only the entry for the loaded model type is applied;
   * e.g. with modelType 'vits', only modelOptions.vits is used.
   */
  modelOptions?: TtsModelOptions;

  /**
   * Path(s) to rule FSTs for TTS (OfflineTtsConfig.ruleFsts), used for
   * text normalization / ITN.
   */
  ruleFsts?: string;

  /**
   * Path(s) to rule FARs for TTS (OfflineTtsConfig.ruleFars), used for
   * text normalization / ITN.
   */
  ruleFars?: string;

  /**
   * Maximum number of sentences per streaming callback
   * (OfflineTtsConfig.maxNumSentences). Default: 1.
   */
  maxNumSentences?: number;

  /**
   * Config-level silence scale (OfflineTtsConfig.silenceScale). Default: 0.2.
   */
  silenceScale?: number;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Options for changing TTS model parameters at runtime.
 *
 * Only the options entry for the given modelType is applied; it is
 * flattened to the native noiseScale / noiseScaleW / lengthScale values.
 */
export interface TtsUpdateOptions {
  /**
   * The model type that is currently loaded. If omitted or 'auto', the SDK
   * falls back to the model type of the last successful initializeTTS().
   * After unloadTTS(), pass modelType explicitly until init is called again.
   */
  modelType?: TTSModelType;

  /**
   * Per-model options. Only the entry for the effective model type is used
   * (e.g. modelOptions.vits when the type is 'vits').
   */
  modelOptions?: TtsModelOptions;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Per-call options for TTS generation.
 *
 * When reference audio or advanced options are supplied these map to the
 * Kotlin GenerationConfig; otherwise the simple sid/speed values are used.
 */
export interface TtsGenerationOptions {
  /**
   * Speaker ID for multi-speaker models; ignored by single-speaker models.
   *
   * Use `getNumSpeakers()` to find out how many speakers are available.
   *
   * @default 0
   */
  sid?: number;

  /**
   * Speech speed multiplier.
   *
   * - 1.0 = normal speed
   * - 0.5 = half speed (slower)
   * - 2.0 = double speed (faster)
   *
   * @default 1.0
   */
  speed?: number;

  /**
   * Silence scale applied at generate time (Kotlin GenerationConfig.silenceScale).
   */
  silenceScale?: number;

  /**
   * Reference audio for voice cloning (Kotlin GenerationConfig): mono float
   * samples in [-1, 1] together with their sample rate in Hz.
   *
   * In the Kotlin/RN stack, only Pocket TTS uses this; other model types
   * (vits, matcha, kokoro, kitten) ignore it.
   *
   * NOTE(review): 'zipvoice' is described in this file as voice-cloning
   * capable but is not mentioned here — confirm whether it also reads this.
   */
  referenceAudio?: { samples: Array<number>; sampleRate: number };

  /**
   * Transcript of the reference audio (Kotlin GenerationConfig.referenceText).
   * Required for Pocket TTS when referenceAudio is provided; ignored by
   * other model types.
   */
  referenceText?: string;

  /**
   * Number of steps, e.g. flow-matching steps (Kotlin GenerationConfig.numSteps).
   * Used by models such as Pocket.
   */
  numSteps?: number;

  /**
   * Additional model-specific key-value options (Kotlin GenerationConfig.extra),
   * e.g. temperature or chunk_size for Pocket.
   */
  extra?: Record<string, string>;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Audio produced by TTS synthesis.
 *
 * Samples are normalized floats in the range [-1.0, 1.0]. To play the audio
 * or store it as a WAV file, convert the samples to whatever format your
 * use case requires.
 */
export interface GeneratedAudio {
  /**
   * Raw PCM audio data: float samples in the range [-1.0, 1.0].
   */
  samples: Array<number>;

  /**
   * Sample rate of the generated audio in Hz.
   * Common values: 16000, 22050, 44100, 48000
   */
  sampleRate: number;
}
|
|
250
|
+
|
|
251
|
+
/**
 * One subtitle/timestamp entry for synthesized speech.
 */
export interface TtsSubtitleItem {
  /** Text token covered by this time range. */
  text: string;

  /** Start time in seconds. */
  start: number;

  /** End time in seconds. */
  end: number;
}
|
|
270
|
+
|
|
271
|
+
/**
 * Generated audio accompanied by subtitle/timestamp metadata.
 */
export interface GeneratedAudioWithTimestamps extends GeneratedAudio {
  /** Subtitle/timestamp entries. */
  subtitles: Array<TtsSubtitleItem>;

  /** True when the timestamps are estimated rather than model-provided. */
  estimated: boolean;
}
|
|
285
|
+
|
|
286
|
+
/**
 * Payload of a streaming chunk event emitted during TTS generation.
 */
export interface TtsStreamChunk {
  /**
   * Instance ID (set by native for multi-instance routing).
   */
  instanceId?: string;
  samples: Array<number>;
  sampleRate: number;
  progress: number;
  isFinal: boolean;
}
|
|
297
|
+
|
|
298
|
+
/**
 * Payload of the streaming end event.
 */
export interface TtsStreamEnd {
  /**
   * Instance ID (set by native for multi-instance routing).
   */
  instanceId?: string;
  cancelled: boolean;
}
|
|
306
|
+
|
|
307
|
+
/**
 * Payload of the streaming error event.
 */
export interface TtsStreamError {
  /**
   * Instance ID (set by native for multi-instance routing).
   */
  instanceId?: string;
  message: string;
}
|
|
315
|
+
|
|
316
|
+
/**
 * Optional callbacks for TTS streaming generation: one each for chunk,
 * end, and error events.
 */
export interface TtsStreamHandlers {
  /** Receives each streaming chunk payload. */
  onChunk?: (chunk: TtsStreamChunk) => void;

  /** Receives the streaming end payload. */
  onEnd?: (event: TtsStreamEnd) => void;

  /** Receives the streaming error payload. */
  onError?: (event: TtsStreamError) => void;
}
|
|
324
|
+
|
|
325
|
+
/**
 * Instance-based TTS engine, as returned by createTTS().
 *
 * Call destroy() once the engine is no longer needed so that native
 * resources are freed.
 */
export interface TtsEngine {
  readonly instanceId: string;

  generateSpeech(
    text: string,
    options?: TtsGenerationOptions
  ): Promise<GeneratedAudio>;

  generateSpeechWithTimestamps(
    text: string,
    options?: TtsGenerationOptions
  ): Promise<GeneratedAudioWithTimestamps>;

  generateSpeechStream(
    text: string,
    options: TtsGenerationOptions | undefined,
    handlers: TtsStreamHandlers
  ): Promise<() => void>;

  cancelSpeechStream(): Promise<void>;

  startPcmPlayer(sampleRate: number, channels: number): Promise<void>;

  writePcmChunk(samples: Array<number>): Promise<void>;

  stopPcmPlayer(): Promise<void>;

  updateParams(options: TtsUpdateOptions): Promise<{
    success: boolean;
    detectedModels: { type: string; modelDir: string }[];
  }>;

  getModelInfo(): Promise<TTSModelInfo>;

  getSampleRate(): Promise<number>;

  getNumSpeakers(): Promise<number>;

  destroy(): Promise<void>;
}
|
|
357
|
+
|
|
358
|
+
/**
 * Describes the capabilities of a TTS model.
 */
export interface TTSModelInfo {
  /** Sample rate at which the model generates audio. */
  sampleRate: number;

  /**
   * Number of speakers/voices the model offers.
   * - 0 or 1: Single-speaker model
   * - >1: Multi-speaker model
   */
  numSpeakers: number;
}
|
package/src/types.ts
CHANGED
|
@@ -27,47 +27,3 @@ export type ModelPathConfig =
|
|
|
27
27
|
type: 'auto';
|
|
28
28
|
path: string;
|
|
29
29
|
};
|
|
30
|
-
|
|
31
|
-
/**
 * Model type used for explicit model detection.
 */
export type ModelType =
  | 'transducer'
  | 'paraformer'
  | 'nemo_ctc'
  | 'whisper'
  | 'wenet_ctc'
  | 'sense_voice'
  | 'funasr_nano'
  | 'auto';
|
|
43
|
-
|
|
44
|
-
/**
 * Options for model initialization.
 */
export interface InitializeOptions {
  /**
   * Configuration of the model directory path.
   */
  modelPath: ModelPathConfig | string; // string for backward compatibility

  /**
   * Quantization preference for the model.
   * - true: Prefer int8 quantized models (model.int8.onnx) - smaller, faster
   * - false: Prefer regular models (model.onnx) - higher accuracy
   * - undefined: Try int8 first, then fall back to regular (default behavior)
   */
  preferInt8?: boolean;

  /**
   * Explicitly specified model type.
   * - 'transducer': Force detection as Zipformer/Transducer model
   * - 'paraformer': Force detection as Paraformer model
   * - 'nemo_ctc': Force detection as NeMo CTC model
   * - 'whisper': Force detection as Whisper model
   * - 'wenet_ctc': Force detection as WeNet CTC model
   * - 'sense_voice': Force detection as SenseVoice model
   * - 'funasr_nano': Force detection as FunASR Nano model
   * - 'auto': Automatic detection based on files (default)
   */
  modelType?: ModelType;
}
|