react-native-sherpa-onnx 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. package/README.md +232 -236
  2. package/SherpaOnnx.podspec +68 -64
  3. package/android/build.gradle +182 -192
  4. package/android/codegen.gradle +57 -0
  5. package/android/prebuilt-download.gradle +428 -0
  6. package/android/prebuilt-versions.gradle +43 -0
  7. package/android/proguard-rules.pro +10 -0
  8. package/android/src/main/assets/testModels/add_mul_add.onnx +28 -0
  9. package/android/src/main/assets/testModels/nnapi_internal_uint8_support.onnx +0 -0
  10. package/android/src/main/assets/testModels/qnn_multi_ctx_embed.onnx +0 -0
  11. package/android/src/main/cpp/CMakeLists.txt +166 -129
  12. package/android/src/main/cpp/CMakePresets.json +54 -0
  13. package/android/src/main/cpp/crypto/sha256.cpp +174 -0
  14. package/android/src/main/cpp/crypto/sha256.h +16 -0
  15. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +404 -0
  16. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +56 -0
  17. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +181 -0
  18. package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +888 -0
  19. package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-common.h +18 -18
  20. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +86 -0
  21. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +20 -0
  22. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +423 -0
  23. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +55 -0
  24. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +399 -0
  25. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +238 -0
  26. package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-model-detect.h +122 -89
  27. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +99 -0
  28. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.h +16 -0
  29. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +78 -0
  30. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.h +16 -0
  31. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +190 -0
  32. package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +301 -0
  33. package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +94 -0
  34. package/android/src/main/java/com/sherpaonnx/{SherpaOnnxCoreHelper.kt → SherpaOnnxAssetHelper.kt} +350 -236
  35. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +791 -483
  36. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +699 -109
  37. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +1123 -668
  38. package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +187 -0
  39. package/ios/SherpaOnnx+Assets.h +11 -0
  40. package/ios/SherpaOnnx+Assets.mm +325 -0
  41. package/ios/SherpaOnnx+STT.mm +455 -118
  42. package/ios/SherpaOnnx+TTS.mm +1101 -712
  43. package/ios/SherpaOnnx.h +17 -6
  44. package/ios/SherpaOnnx.mm +206 -311
  45. package/ios/SherpaOnnx.xcconfig +19 -19
  46. package/ios/SherpaOnnxCoreMLHelper.swift +24 -0
  47. package/ios/archive/sherpa-onnx-archive-helper.h +21 -0
  48. package/ios/archive/sherpa-onnx-archive-helper.mm +296 -0
  49. package/ios/libarchive_darwin_config.h +153 -0
  50. package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-common.h +18 -18
  51. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +49 -0
  52. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +210 -0
  53. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +344 -0
  54. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +201 -0
  55. package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-model-detect.h +117 -89
  56. package/ios/scripts/patch-libarchive-includes.sh +61 -0
  57. package/ios/scripts/setup-ios-libarchive.sh +98 -0
  58. package/ios/stt/sherpa-onnx-stt-wrapper.h +129 -0
  59. package/ios/stt/sherpa-onnx-stt-wrapper.mm +523 -0
  60. package/ios/{sherpa-onnx-tts-wrapper.h → tts/sherpa-onnx-tts-wrapper.h} +90 -85
  61. package/ios/{sherpa-onnx-tts-wrapper.mm → tts/sherpa-onnx-tts-wrapper.mm} +376 -345
  62. package/lib/module/NativeSherpaOnnx.js +3 -0
  63. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  64. package/lib/module/audio/index.js +22 -0
  65. package/lib/module/audio/index.js.map +1 -0
  66. package/lib/module/diarization/index.js +1 -1
  67. package/lib/module/diarization/index.js.map +1 -1
  68. package/lib/module/download/ModelDownloadManager.js +918 -0
  69. package/lib/module/download/ModelDownloadManager.js.map +1 -0
  70. package/lib/module/download/extractTarBz2.js +53 -0
  71. package/lib/module/download/extractTarBz2.js.map +1 -0
  72. package/lib/module/download/index.js +6 -0
  73. package/lib/module/download/index.js.map +1 -0
  74. package/lib/module/download/validation.js +178 -0
  75. package/lib/module/download/validation.js.map +1 -0
  76. package/lib/module/enhancement/index.js +1 -1
  77. package/lib/module/enhancement/index.js.map +1 -1
  78. package/lib/module/index.js +41 -3
  79. package/lib/module/index.js.map +1 -1
  80. package/lib/module/separation/index.js +1 -1
  81. package/lib/module/separation/index.js.map +1 -1
  82. package/lib/module/stt/index.js +127 -60
  83. package/lib/module/stt/index.js.map +1 -1
  84. package/lib/module/stt/sttModelLanguages.js +512 -0
  85. package/lib/module/stt/sttModelLanguages.js.map +1 -0
  86. package/lib/module/stt/types.js +53 -1
  87. package/lib/module/stt/types.js.map +1 -1
  88. package/lib/module/tts/index.js +216 -289
  89. package/lib/module/tts/index.js.map +1 -1
  90. package/lib/module/tts/types.js +86 -1
  91. package/lib/module/tts/types.js.map +1 -1
  92. package/lib/module/types.js.map +1 -1
  93. package/lib/module/utils.js +86 -73
  94. package/lib/module/utils.js.map +1 -1
  95. package/lib/module/vad/index.js +1 -1
  96. package/lib/module/vad/index.js.map +1 -1
  97. package/lib/typescript/src/NativeSherpaOnnx.d.ts +192 -38
  98. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  99. package/lib/typescript/src/audio/index.d.ts +13 -0
  100. package/lib/typescript/src/audio/index.d.ts.map +1 -0
  101. package/lib/typescript/src/diarization/index.d.ts +3 -2
  102. package/lib/typescript/src/diarization/index.d.ts.map +1 -1
  103. package/lib/typescript/src/download/ModelDownloadManager.d.ts +108 -0
  104. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -0
  105. package/lib/typescript/src/download/extractTarBz2.d.ts +14 -0
  106. package/lib/typescript/src/download/extractTarBz2.d.ts.map +1 -0
  107. package/lib/typescript/src/download/index.d.ts +7 -0
  108. package/lib/typescript/src/download/index.d.ts.map +1 -0
  109. package/lib/typescript/src/download/validation.d.ts +57 -0
  110. package/lib/typescript/src/download/validation.d.ts.map +1 -0
  111. package/lib/typescript/src/enhancement/index.d.ts +3 -2
  112. package/lib/typescript/src/enhancement/index.d.ts.map +1 -1
  113. package/lib/typescript/src/index.d.ts +26 -2
  114. package/lib/typescript/src/index.d.ts.map +1 -1
  115. package/lib/typescript/src/separation/index.d.ts +3 -2
  116. package/lib/typescript/src/separation/index.d.ts.map +1 -1
  117. package/lib/typescript/src/stt/index.d.ts +31 -43
  118. package/lib/typescript/src/stt/index.d.ts.map +1 -1
  119. package/lib/typescript/src/stt/sttModelLanguages.d.ts +52 -0
  120. package/lib/typescript/src/stt/sttModelLanguages.d.ts.map +1 -0
  121. package/lib/typescript/src/stt/types.d.ts +196 -9
  122. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  123. package/lib/typescript/src/tts/index.d.ts +25 -211
  124. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  125. package/lib/typescript/src/tts/types.d.ts +148 -25
  126. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  127. package/lib/typescript/src/types.d.ts +0 -32
  128. package/lib/typescript/src/types.d.ts.map +1 -1
  129. package/lib/typescript/src/utils.d.ts +28 -13
  130. package/lib/typescript/src/utils.d.ts.map +1 -1
  131. package/lib/typescript/src/vad/index.d.ts +3 -2
  132. package/lib/typescript/src/vad/index.d.ts.map +1 -1
  133. package/package.json +250 -222
  134. package/scripts/check-qnn-support.sh +78 -0
  135. package/scripts/setup-ios-framework.sh +379 -282
  136. package/src/NativeSherpaOnnx.ts +474 -251
  137. package/src/audio/index.ts +32 -0
  138. package/src/diarization/index.ts +4 -2
  139. package/src/download/ModelDownloadManager.ts +1325 -0
  140. package/src/download/extractTarBz2.ts +78 -0
  141. package/src/download/index.ts +43 -0
  142. package/src/download/validation.ts +279 -0
  143. package/src/enhancement/index.ts +4 -2
  144. package/src/index.tsx +78 -27
  145. package/src/separation/index.ts +4 -2
  146. package/src/stt/index.ts +249 -89
  147. package/src/stt/sttModelLanguages.ts +237 -0
  148. package/src/stt/types.ts +263 -9
  149. package/src/tts/index.ts +470 -458
  150. package/src/tts/types.ts +373 -218
  151. package/src/types.ts +0 -44
  152. package/src/utils.ts +145 -131
  153. package/src/vad/index.ts +4 -2
  154. package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -0
  155. package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -0
  156. package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -0
  157. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -0
  158. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -0
  159. package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +0 -1918
  160. package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +0 -841
  161. package/android/src/main/cpp/jni/sherpa-onnx-model-detect.cpp +0 -541
  162. package/android/src/main/cpp/jni/sherpa-onnx-stt-jni.cpp +0 -336
  163. package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.cpp +0 -222
  164. package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.h +0 -68
  165. package/android/src/main/cpp/jni/sherpa-onnx-tts-jni.cpp +0 -823
  166. package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.cpp +0 -387
  167. package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.h +0 -147
  168. package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
  169. package/ios/include/sherpa-onnx/c-api/c-api.h +0 -1918
  170. package/ios/include/sherpa-onnx/c-api/cxx-api.h +0 -841
  171. package/ios/sherpa-onnx-model-detect.mm +0 -441
  172. package/ios/sherpa-onnx-stt-wrapper.h +0 -48
  173. package/ios/sherpa-onnx-stt-wrapper.mm +0 -201
  174. package/scripts/copy-headers.js +0 -184
  175. package/scripts/setup-assets.js +0 -323
package/src/tts/types.ts CHANGED
@@ -1,218 +1,373 @@
1
- import type { ModelPathConfig } from '../types';
2
-
3
- /**
4
- * Supported TTS model types.
5
- *
6
- * - 'vits': VITS models (includes Piper, Coqui, MeloTTS, MMS variants)
7
- * - 'matcha': Matcha models (acoustic model + vocoder)
8
- * - 'kokoro': Kokoro models (multi-speaker, multi-language)
9
- * - 'kitten': KittenTTS models (lightweight, multi-speaker)
10
- * - 'zipvoice': Zipvoice models (voice cloning capable)
11
- * - 'auto': Auto-detect model type based on files present (default)
12
- */
13
- export type TTSModelType =
14
- | 'vits'
15
- | 'matcha'
16
- | 'kokoro'
17
- | 'kitten'
18
- | 'zipvoice'
19
- | 'auto';
20
-
21
- /**
22
- * Configuration for TTS initialization.
23
- */
24
- export interface TTSInitializeOptions {
25
- /**
26
- * Path to the model directory.
27
- * Can be an asset path, file system path, or auto-detection path.
28
- */
29
- modelPath: ModelPathConfig | string;
30
-
31
- /**
32
- * Model type to use.
33
- * If not specified or 'auto', the model type will be auto-detected
34
- * based on the files present in the model directory.
35
- *
36
- * @default 'auto'
37
- */
38
- modelType?: TTSModelType;
39
-
40
- /**
41
- * Number of threads to use for inference.
42
- * More threads = faster processing but more CPU usage.
43
- *
44
- * @default 2
45
- */
46
- numThreads?: number;
47
-
48
- /**
49
- * Enable debug logging from the TTS engine.
50
- *
51
- * @default false
52
- */
53
- debug?: boolean;
54
-
55
- /**
56
- * Noise scale for VITS/Matcha models.
57
- *
58
- * If omitted, the model default (or model.json) is used.
59
- */
60
- noiseScale?: number;
61
-
62
- /**
63
- * Noise scale W for VITS models.
64
- *
65
- * If omitted, the model default (or model.json) is used.
66
- */
67
- noiseScaleW?: number;
68
-
69
- /**
70
- * Length scale for VITS/Matcha/Kokoro/Kitten models.
71
- *
72
- * If omitted, the model default (or model.json) is used.
73
- */
74
- lengthScale?: number;
75
- }
76
-
77
- /**
78
- * Options for updating TTS model parameters.
79
- */
80
- export interface TtsUpdateOptions {
81
- /**
82
- * Noise scale for VITS/Matcha models.
83
- */
84
- noiseScale?: number | null;
85
-
86
- /**
87
- * Noise scale W for VITS models.
88
- */
89
- noiseScaleW?: number | null;
90
-
91
- /**
92
- * Length scale for VITS/Matcha/Kokoro/Kitten models.
93
- */
94
- lengthScale?: number | null;
95
- }
96
-
97
- /**
98
- * Options for speech synthesis.
99
- */
100
- export interface SynthesisOptions {
101
- /**
102
- * Speaker ID for multi-speaker models.
103
- * For single-speaker models, this is ignored.
104
- *
105
- * Use `getNumSpeakers()` to check how many speakers are available.
106
- *
107
- * @default 0
108
- */
109
- sid?: number;
110
-
111
- /**
112
- * Speech speed multiplier.
113
- *
114
- * - 1.0 = normal speed
115
- * - 0.5 = half speed (slower)
116
- * - 2.0 = double speed (faster)
117
- *
118
- * @default 1.0
119
- */
120
- speed?: number;
121
- }
122
-
123
- /**
124
- * Generated audio data from TTS synthesis.
125
- *
126
- * The samples are normalized float values in the range [-1.0, 1.0].
127
- * To save as a WAV file or play the audio, you'll need to convert
128
- * these samples to the appropriate format for your use case.
129
- */
130
- export interface GeneratedAudio {
131
- /**
132
- * Audio samples as an array of float values in range [-1.0, 1.0].
133
- * This is raw PCM audio data.
134
- */
135
- samples: number[];
136
-
137
- /**
138
- * Sample rate of the generated audio in Hz.
139
- * Common values: 16000, 22050, 44100, 48000
140
- */
141
- sampleRate: number;
142
- }
143
-
144
- /**
145
- * Subtitle/timestamp item for synthesized speech.
146
- */
147
- export interface TtsSubtitleItem {
148
- /**
149
- * Text token for this time range.
150
- */
151
- text: string;
152
-
153
- /**
154
- * Start time in seconds.
155
- */
156
- start: number;
157
-
158
- /**
159
- * End time in seconds.
160
- */
161
- end: number;
162
- }
163
-
164
- /**
165
- * Generated audio with subtitle/timestamp metadata.
166
- */
167
- export interface GeneratedAudioWithTimestamps extends GeneratedAudio {
168
- /**
169
- * Subtitle/timestamp entries.
170
- */
171
- subtitles: TtsSubtitleItem[];
172
-
173
- /**
174
- * True if timestamps are estimated rather than model-provided.
175
- */
176
- estimated: boolean;
177
- }
178
-
179
- /**
180
- * Streaming chunk event payload for TTS generation.
181
- */
182
- export interface TtsStreamChunk {
183
- samples: number[];
184
- sampleRate: number;
185
- progress: number;
186
- isFinal: boolean;
187
- }
188
-
189
- /**
190
- * Streaming end event payload.
191
- */
192
- export interface TtsStreamEnd {
193
- cancelled: boolean;
194
- }
195
-
196
- /**
197
- * Streaming error event payload.
198
- */
199
- export interface TtsStreamError {
200
- message: string;
201
- }
202
-
203
- /**
204
- * Information about TTS model capabilities.
205
- */
206
- export interface TTSModelInfo {
207
- /**
208
- * Sample rate that the model generates audio at.
209
- */
210
- sampleRate: number;
211
-
212
- /**
213
- * Number of speakers/voices available in the model.
214
- * - 0 or 1: Single-speaker model
215
- * - >1: Multi-speaker model
216
- */
217
- numSpeakers: number;
218
- }
1
+ import type { ModelPathConfig } from '../types';
2
+
3
+ /**
4
+ * Supported TTS model types.
5
+ *
6
+ * - 'vits': VITS models (includes Piper, Coqui, MeloTTS, MMS variants)
7
+ * - 'matcha': Matcha models (acoustic model + vocoder)
8
+ * - 'kokoro': Kokoro models (multi-speaker, multi-language)
9
+ * - 'kitten': KittenTTS models (lightweight, multi-speaker)
10
+ * - 'pocket': Pocket TTS models
11
+ * - 'zipvoice': Zipvoice models (voice cloning capable)
12
+ * - 'auto': Auto-detect model type based on files present (default)
13
+ */
14
+ export type TTSModelType =
15
+ | 'vits'
16
+ | 'matcha'
17
+ | 'kokoro'
18
+ | 'kitten'
19
+ | 'pocket'
20
+ | 'zipvoice'
21
+ | 'auto';
22
+
23
+ /** Runtime list of supported TTS model types. */
24
+ export const TTS_MODEL_TYPES: readonly TTSModelType[] = [
25
+ 'vits',
26
+ 'matcha',
27
+ 'kokoro',
28
+ 'kitten',
29
+ 'pocket',
30
+ 'zipvoice',
31
+ 'auto',
32
+ ] as const;
33
+
34
+ // ========== Model-specific options (only applied when that model type is loaded) ==========
35
+
36
+ /** Options for VITS models. Applied only when modelType is 'vits'. Kotlin OfflineTtsVitsModelConfig. */
37
+ export interface TtsVitsModelOptions {
38
+ /** Noise scale. If omitted, model default (or model.json) is used. */
39
+ noiseScale?: number;
40
+ /** Noise scale W. If omitted, model default is used. */
41
+ noiseScaleW?: number;
42
+ /** Length scale. If omitted, model default is used. */
43
+ lengthScale?: number;
44
+ }
45
+
46
+ /** Options for Matcha models. Applied only when modelType is 'matcha'. Kotlin OfflineTtsMatchaModelConfig. */
47
+ export interface TtsMatchaModelOptions {
48
+ /** Noise scale. If omitted, model default is used. */
49
+ noiseScale?: number;
50
+ /** Length scale. If omitted, model default is used. */
51
+ lengthScale?: number;
52
+ }
53
+
54
+ /** Options for Kokoro models. Applied only when modelType is 'kokoro'. Kotlin OfflineTtsKokoroModelConfig. */
55
+ export interface TtsKokoroModelOptions {
56
+ /** Length scale. If omitted, model default is used. */
57
+ lengthScale?: number;
58
+ }
59
+
60
+ /** Options for KittenTTS models. Applied only when modelType is 'kitten'. Kotlin OfflineTtsKittenModelConfig. */
61
+ export interface TtsKittenModelOptions {
62
+ /** Length scale. If omitted, model default is used. */
63
+ lengthScale?: number;
64
+ }
65
+
66
+ /** Options for Pocket TTS models. Applied only when modelType is 'pocket'. Kotlin OfflineTtsPocketModelConfig currently has no init-time options, so this interface is empty and reserved for future use. */
67
+ export interface TtsPocketModelOptions {
68
+ // No init-time options in Kotlin OfflineTtsPocketModelConfig; voice cloning is via GenerationConfig at generate time.
69
+ }
70
+
71
+ /**
72
+ * Model-specific TTS options. Only the block for the actually loaded model type is applied;
73
+ * others are ignored (e.g. vits options have no effect when a kokoro model is loaded).
74
+ */
75
+ export interface TtsModelOptions {
76
+ vits?: TtsVitsModelOptions;
77
+ matcha?: TtsMatchaModelOptions;
78
+ kokoro?: TtsKokoroModelOptions;
79
+ kitten?: TtsKittenModelOptions;
80
+ pocket?: TtsPocketModelOptions;
81
+ }
82
+
83
+ /**
84
+ * Configuration for TTS initialization.
85
+ */
86
+ export interface TTSInitializeOptions {
87
+ /**
88
+ * Path to the model directory.
89
+ * Can be an asset path, file system path, or auto-detection path.
90
+ */
91
+ modelPath: ModelPathConfig;
92
+
93
+ /**
94
+ * Model type to use.
95
+ * If not specified or 'auto', the model type will be auto-detected
96
+ * based on the files present in the model directory.
97
+ *
98
+ * @default 'auto'
99
+ */
100
+ modelType?: TTSModelType;
101
+
102
+ /**
103
+ * Execution provider (e.g. `'cpu'`, `'coreml'`, `'xnnpack'`, `'nnapi'`, `'qnn'`).
104
+ * Use getCoreMlSupport(), getXnnpackSupport(), etc. to check availability. See execution-providers.md.
105
+ *
106
+ * @default 'cpu'
107
+ */
108
+ provider?: string;
109
+
110
+ /**
111
+ * Number of threads to use for inference.
112
+ * More threads = faster processing but more CPU usage.
113
+ *
114
+ * @default 2
115
+ */
116
+ numThreads?: number;
117
+
118
+ /**
119
+ * Enable debug logging from the TTS engine.
120
+ *
121
+ * @default false
122
+ */
123
+ debug?: boolean;
124
+
125
+ /**
126
+ * Model-specific options. Only options for the loaded model type are applied.
127
+ * E.g. when modelType is 'vits', only modelOptions.vits is used.
128
+ */
129
+ modelOptions?: TtsModelOptions;
130
+
131
+ /**
132
+ * Path(s) to rule FSTs for TTS (OfflineTtsConfig.ruleFsts).
133
+ * Used for text normalization / ITN.
134
+ */
135
+ ruleFsts?: string;
136
+
137
+ /**
138
+ * Path(s) to rule FARs for TTS (OfflineTtsConfig.ruleFars).
139
+ * Used for text normalization / ITN.
140
+ */
141
+ ruleFars?: string;
142
+
143
+ /**
144
+ * Max number of sentences per streaming callback (OfflineTtsConfig.maxNumSentences).
145
+ * Default: 1.
146
+ */
147
+ maxNumSentences?: number;
148
+
149
+ /**
150
+ * Silence scale on config level (OfflineTtsConfig.silenceScale).
151
+ * Default: 0.2.
152
+ */
153
+ silenceScale?: number;
154
+ }
155
+
156
+ /**
157
+ * Options for updating TTS model parameters at runtime.
158
+ * Only the block for the effective model type is applied; its values are flattened to the native noiseScale / noiseScaleW / lengthScale parameters.
159
+ */
160
+ export interface TtsUpdateOptions {
161
+ /**
162
+ * Model type currently loaded. When omitted or 'auto', the SDK uses the model type from the last
163
+ * successful initializeTTS(). After unloadTTS(), pass modelType explicitly until init is called again.
164
+ */
165
+ modelType?: TTSModelType;
166
+
167
+ /**
168
+ * Model-specific options. Only the block for the effective model type is used (e.g. modelOptions.vits when type is 'vits').
169
+ */
170
+ modelOptions?: TtsModelOptions;
171
+ }
172
+
173
+ /**
174
+ * Options for TTS generation. Maps to Kotlin GenerationConfig when reference
175
+ * audio or advanced options are used; otherwise simple sid/speed are used.
176
+ */
177
+ export interface TtsGenerationOptions {
178
+ /**
179
+ * Speaker ID for multi-speaker models.
180
+ * For single-speaker models, this is ignored.
181
+ *
182
+ * Use `getNumSpeakers()` to check how many speakers are available.
183
+ *
184
+ * @default 0
185
+ */
186
+ sid?: number;
187
+
188
+ /**
189
+ * Speech speed multiplier.
190
+ *
191
+ * - 1.0 = normal speed
192
+ * - 0.5 = half speed (slower)
193
+ * - 2.0 = double speed (faster)
194
+ *
195
+ * @default 1.0
196
+ */
197
+ speed?: number;
198
+
199
+ /**
200
+ * Silence scale (Kotlin GenerationConfig.silenceScale). Used at generate time.
201
+ */
202
+ silenceScale?: number;
203
+
204
+ /**
205
+ * Reference audio for voice cloning (Kotlin GenerationConfig).
206
+ * In the Kotlin/RN stack, only Pocket TTS uses this; other model types (vits, matcha, kokoro, kitten) ignore it.
207
+ * Mono float samples in [-1, 1] and sample rate in Hz.
208
+ */
209
+ referenceAudio?: { samples: number[]; sampleRate: number };
210
+
211
+ /**
212
+ * Transcript text of the reference audio (Kotlin GenerationConfig.referenceText).
213
+ * Required for Pocket TTS when referenceAudio is provided; ignored by other model types.
214
+ */
215
+ referenceText?: string;
216
+
217
+ /**
218
+ * Number of steps, e.g. flow-matching steps (Kotlin GenerationConfig.numSteps).
219
+ * Used by models such as Pocket.
220
+ */
221
+ numSteps?: number;
222
+
223
+ /**
224
+ * Extra options as key-value pairs (Kotlin GenerationConfig.extra).
225
+ * Model-specific (e.g. temperature, chunk_size for Pocket).
226
+ */
227
+ extra?: Record<string, string>;
228
+ }
229
+
230
+ /**
231
+ * Generated audio data from TTS synthesis.
232
+ *
233
+ * The samples are normalized float values in the range [-1.0, 1.0].
234
+ * To save as a WAV file or play the audio, you'll need to convert
235
+ * these samples to the appropriate format for your use case.
236
+ */
237
+ export interface GeneratedAudio {
238
+ /**
239
+ * Audio samples as an array of float values in range [-1.0, 1.0].
240
+ * This is raw PCM audio data.
241
+ */
242
+ samples: number[];
243
+
244
+ /**
245
+ * Sample rate of the generated audio in Hz.
246
+ * Common values: 16000, 22050, 44100, 48000
247
+ */
248
+ sampleRate: number;
249
+ }
250
+
251
+ /**
252
+ * Subtitle/timestamp item for synthesized speech.
253
+ */
254
+ export interface TtsSubtitleItem {
255
+ /**
256
+ * Text token for this time range.
257
+ */
258
+ text: string;
259
+
260
+ /**
261
+ * Start time in seconds.
262
+ */
263
+ start: number;
264
+
265
+ /**
266
+ * End time in seconds.
267
+ */
268
+ end: number;
269
+ }
270
+
271
+ /**
272
+ * Generated audio with subtitle/timestamp metadata.
273
+ */
274
+ export interface GeneratedAudioWithTimestamps extends GeneratedAudio {
275
+ /**
276
+ * Subtitle/timestamp entries.
277
+ */
278
+ subtitles: TtsSubtitleItem[];
279
+
280
+ /**
281
+ * True if timestamps are estimated rather than model-provided.
282
+ */
283
+ estimated: boolean;
284
+ }
285
+
286
+ /**
287
+ * Streaming chunk event payload for TTS generation.
288
+ */
289
+ export interface TtsStreamChunk {
290
+ /** Instance ID (set by native for multi-instance routing). */
291
+ instanceId?: string;
292
+ samples: number[];
293
+ sampleRate: number;
294
+ progress: number;
295
+ isFinal: boolean;
296
+ }
297
+
298
+ /**
299
+ * Streaming end event payload.
300
+ */
301
+ export interface TtsStreamEnd {
302
+ /** Instance ID (set by native for multi-instance routing). */
303
+ instanceId?: string;
304
+ cancelled: boolean;
305
+ }
306
+
307
+ /**
308
+ * Streaming error event payload.
309
+ */
310
+ export interface TtsStreamError {
311
+ /** Instance ID (set by native for multi-instance routing). */
312
+ instanceId?: string;
313
+ message: string;
314
+ }
315
+
316
+ /**
317
+ * Handlers for TTS streaming generation (chunk, end, error).
318
+ */
319
+ export interface TtsStreamHandlers {
320
+ onChunk?: (chunk: TtsStreamChunk) => void;
321
+ onEnd?: (event: TtsStreamEnd) => void;
322
+ onError?: (event: TtsStreamError) => void;
323
+ }
324
+
325
+ /**
326
+ * Instance-based TTS engine returned by createTTS().
327
+ * Call destroy() when done to free native resources.
328
+ */
329
+ export interface TtsEngine {
330
+ readonly instanceId: string;
331
+ generateSpeech(
332
+ text: string,
333
+ options?: TtsGenerationOptions
334
+ ): Promise<GeneratedAudio>;
335
+ generateSpeechWithTimestamps(
336
+ text: string,
337
+ options?: TtsGenerationOptions
338
+ ): Promise<GeneratedAudioWithTimestamps>;
339
+ generateSpeechStream(
340
+ text: string,
341
+ options: TtsGenerationOptions | undefined,
342
+ handlers: TtsStreamHandlers
343
+ ): Promise<() => void>;
344
+ cancelSpeechStream(): Promise<void>;
345
+ startPcmPlayer(sampleRate: number, channels: number): Promise<void>;
346
+ writePcmChunk(samples: number[]): Promise<void>;
347
+ stopPcmPlayer(): Promise<void>;
348
+ updateParams(options: TtsUpdateOptions): Promise<{
349
+ success: boolean;
350
+ detectedModels: Array<{ type: string; modelDir: string }>;
351
+ }>;
352
+ getModelInfo(): Promise<TTSModelInfo>;
353
+ getSampleRate(): Promise<number>;
354
+ getNumSpeakers(): Promise<number>;
355
+ destroy(): Promise<void>;
356
+ }
357
+
358
+ /**
359
+ * Information about TTS model capabilities.
360
+ */
361
+ export interface TTSModelInfo {
362
+ /**
363
+ * Sample rate that the model generates audio at.
364
+ */
365
+ sampleRate: number;
366
+
367
+ /**
368
+ * Number of speakers/voices available in the model.
369
+ * - 0 or 1: Single-speaker model
370
+ * - >1: Multi-speaker model
371
+ */
372
+ numSpeakers: number;
373
+ }
package/src/types.ts CHANGED
@@ -27,47 +27,3 @@ export type ModelPathConfig =
27
27
  type: 'auto';
28
28
  path: string;
29
29
  };
30
-
31
- /**
32
- * Model type for explicit model detection
33
- */
34
- export type ModelType =
35
- | 'transducer'
36
- | 'paraformer'
37
- | 'nemo_ctc'
38
- | 'whisper'
39
- | 'wenet_ctc'
40
- | 'sense_voice'
41
- | 'funasr_nano'
42
- | 'auto';
43
-
44
- /**
45
- * Model initialization options
46
- */
47
- export interface InitializeOptions {
48
- /**
49
- * Model directory path configuration
50
- */
51
- modelPath: ModelPathConfig | string; // string for backward compatibility
52
-
53
- /**
54
- * Model quantization preference
55
- * - true: Prefer int8 quantized models (model.int8.onnx) - smaller, faster
56
- * - false: Prefer regular models (model.onnx) - higher accuracy
57
- * - undefined: Try int8 first, then fall back to regular (default behavior)
58
- */
59
- preferInt8?: boolean;
60
-
61
- /**
62
- * Explicit model type specification
63
- * - 'transducer': Force detection as Zipformer/Transducer model
64
- * - 'paraformer': Force detection as Paraformer model
65
- * - 'nemo_ctc': Force detection as NeMo CTC model
66
- * - 'whisper': Force detection as Whisper model
67
- * - 'wenet_ctc': Force detection as WeNet CTC model
68
- * - 'sense_voice': Force detection as SenseVoice model
69
- * - 'funasr_nano': Force detection as FunASR Nano model
70
- * - 'auto': Automatic detection based on files (default)
71
- */
72
- modelType?: ModelType;
73
- }