react-native-sherpa-onnx 0.3.7 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -2
- package/SherpaOnnx.podspec +4 -1
- package/android/prebuilt-download.gradle +23 -23
- package/android/src/main/assets/model_licenses/asr-models-license-status.csv +1 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +23 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +9 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +51 -8
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +31 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +19 -1
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +5 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +7 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +11 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +14 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +110 -35
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxExtractionNotificationHelper.kt +102 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +92 -18
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +22 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +15 -0
- package/ios/Resources/model_licenses/asr-models-license-status.csv +1 -0
- package/ios/SherpaOnnx+STT.mm +13 -1
- package/ios/SherpaOnnx+TTS.mm +1 -0
- package/ios/SherpaOnnx.mm +87 -17
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +5 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +23 -0
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +51 -7
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +36 -4
- package/ios/model_detect/sherpa-onnx-model-detect.h +19 -1
- package/ios/model_detect/sherpa-onnx-validate-stt.mm +11 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +14 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +11 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +30 -2
- package/ios/tts/sherpa-onnx-tts-wrapper.mm +25 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +1 -1
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/download/background-downloader-types.js +2 -0
- package/lib/module/download/background-downloader-types.js.map +1 -0
- package/lib/module/download/downloadTask.js +54 -1
- package/lib/module/download/downloadTask.js.map +1 -1
- package/lib/module/download/index.js +1 -1
- package/lib/module/download/index.js.map +1 -1
- package/lib/module/download/postDownloadProcessing.js +17 -4
- package/lib/module/download/postDownloadProcessing.js.map +1 -1
- package/lib/module/download/registry.js +1 -0
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/extraction/extractTarBz2.js +2 -2
- package/lib/module/extraction/extractTarBz2.js.map +1 -1
- package/lib/module/extraction/extractTarZst.js +2 -2
- package/lib/module/extraction/extractTarZst.js.map +1 -1
- package/lib/module/extraction/index.js +10 -5
- package/lib/module/extraction/index.js.map +1 -1
- package/lib/module/stt/index.js +4 -2
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/streaming.js +2 -1
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/stt/types.js +3 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +5 -3
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/streaming.js +4 -2
- package/lib/module/tts/streaming.js.map +1 -1
- package/lib/module/tts/types.js +4 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +26 -10
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/download/background-downloader-types.d.ts +64 -0
- package/lib/typescript/src/download/background-downloader-types.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadTask.d.ts +10 -0
- package/lib/typescript/src/download/downloadTask.d.ts.map +1 -1
- package/lib/typescript/src/download/index.d.ts +2 -2
- package/lib/typescript/src/download/index.d.ts.map +1 -1
- package/lib/typescript/src/download/postDownloadProcessing.d.ts +9 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts +2 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarZst.d.ts +2 -1
- package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -1
- package/lib/typescript/src/extraction/index.d.ts +1 -1
- package/lib/typescript/src/extraction/index.d.ts.map +1 -1
- package/lib/typescript/src/extraction/types.d.ts +12 -0
- package/lib/typescript/src/extraction/types.d.ts.map +1 -1
- package/lib/typescript/src/stt/index.d.ts +1 -1
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +16 -1
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +1 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +6 -1
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/scripts/ci/update_model_license_csv.sh +16 -16
- package/src/NativeSherpaOnnx.ts +38 -11
- package/src/download/ModelDownloadManager.ts +2 -0
- package/src/download/background-downloader-types.ts +73 -0
- package/src/download/downloadTask.ts +68 -0
- package/src/download/index.ts +2 -0
- package/src/download/postDownloadProcessing.ts +24 -1
- package/src/download/registry.ts +1 -0
- package/src/extraction/extractTarBz2.ts +7 -2
- package/src/extraction/extractTarZst.ts +7 -2
- package/src/extraction/index.ts +29 -6
- package/src/extraction/types.ts +16 -0
- package/src/stt/index.ts +8 -7
- package/src/stt/streaming.ts +7 -1
- package/src/stt/types.ts +18 -0
- package/src/tts/index.ts +10 -7
- package/src/tts/streaming.ts +8 -3
- package/src/tts/types.ts +9 -0
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/lib/module/download/background-downloader.d.js +0 -2
- package/lib/module/download/background-downloader.d.js.map +0 -1
- package/src/download/background-downloader.d.ts +0 -43
package/ios/SherpaOnnx+STT.mm
CHANGED
|
@@ -36,6 +36,7 @@ static NSString *sttModelKindToNSString(sherpaonnx::SttModelKind kind) {
|
|
|
36
36
|
case K::kZipformerCtc: return @"zipformer_ctc";
|
|
37
37
|
case K::kWhisper: return @"whisper";
|
|
38
38
|
case K::kFunAsrNano: return @"funasr_nano";
|
|
39
|
+
case K::kQwen3Asr: return @"qwen3_asr";
|
|
39
40
|
case K::kFireRedAsr: return @"fire_red_asr";
|
|
40
41
|
case K::kMoonshine: return @"moonshine";
|
|
41
42
|
case K::kMoonshineV2: return @"moonshine_v2";
|
|
@@ -164,10 +165,12 @@ static NSDictionary *sttResultToDict(const sherpaonnx::SttRecognitionResult& r)
|
|
|
164
165
|
sherpaonnx::SttSenseVoiceOptions senseVoiceOpts;
|
|
165
166
|
sherpaonnx::SttCanaryOptions canaryOpts;
|
|
166
167
|
sherpaonnx::SttFunAsrNanoOptions funasrNanoOpts;
|
|
168
|
+
sherpaonnx::SttQwen3AsrOptions qwen3AsrOpts;
|
|
167
169
|
const sherpaonnx::SttWhisperOptions *whisperOptsPtr = nullptr;
|
|
168
170
|
const sherpaonnx::SttSenseVoiceOptions *senseVoiceOptsPtr = nullptr;
|
|
169
171
|
const sherpaonnx::SttCanaryOptions *canaryOptsPtr = nullptr;
|
|
170
172
|
const sherpaonnx::SttFunAsrNanoOptions *funasrNanoOptsPtr = nullptr;
|
|
173
|
+
const sherpaonnx::SttQwen3AsrOptions *qwen3AsrOptsPtr = nullptr;
|
|
171
174
|
if (modelOptions != nil && [modelOptions isKindOfClass:[NSDictionary class]]) {
|
|
172
175
|
NSDictionary *w = modelOptions[@"whisper"];
|
|
173
176
|
if ([w isKindOfClass:[NSDictionary class]]) {
|
|
@@ -202,12 +205,21 @@ static NSDictionary *sttResultToDict(const sherpaonnx::SttRecognitionResult& r)
|
|
|
202
205
|
if (fn[@"hotwords"] != nil) funasrNanoOpts.hotwords = std::string([(NSString *)fn[@"hotwords"] UTF8String]);
|
|
203
206
|
funasrNanoOptsPtr = &funasrNanoOpts;
|
|
204
207
|
}
|
|
208
|
+
NSDictionary *q3 = modelOptions[@"qwen3Asr"];
|
|
209
|
+
if ([q3 isKindOfClass:[NSDictionary class]]) {
|
|
210
|
+
if (q3[@"maxTotalLen"] != nil) qwen3AsrOpts.max_total_len = [(NSNumber *)q3[@"maxTotalLen"] intValue];
|
|
211
|
+
if (q3[@"maxNewTokens"] != nil) qwen3AsrOpts.max_new_tokens = [(NSNumber *)q3[@"maxNewTokens"] intValue];
|
|
212
|
+
if (q3[@"temperature"] != nil) qwen3AsrOpts.temperature = [(NSNumber *)q3[@"temperature"] floatValue];
|
|
213
|
+
if (q3[@"topP"] != nil) qwen3AsrOpts.top_p = [(NSNumber *)q3[@"topP"] floatValue];
|
|
214
|
+
if (q3[@"seed"] != nil) qwen3AsrOpts.seed = [(NSNumber *)q3[@"seed"] intValue];
|
|
215
|
+
qwen3AsrOptsPtr = &qwen3AsrOpts;
|
|
216
|
+
}
|
|
205
217
|
}
|
|
206
218
|
|
|
207
219
|
sherpaonnx::SttInitializeResult result = inst->wrapper->initialize(
|
|
208
220
|
modelDirStr, preferInt8Opt, modelTypeOpt, debugVal, hotwordsFileOpt, hotwordsScoreOpt,
|
|
209
221
|
numThreadsOpt, providerOpt, ruleFstsOpt, ruleFarsOpt, ditherOpt,
|
|
210
|
-
whisperOptsPtr, senseVoiceOptsPtr, canaryOptsPtr, funasrNanoOptsPtr);
|
|
222
|
+
whisperOptsPtr, senseVoiceOptsPtr, canaryOptsPtr, funasrNanoOptsPtr, qwen3AsrOptsPtr);
|
|
211
223
|
|
|
212
224
|
if (result.success) {
|
|
213
225
|
RCTLogInfo(@"Sherpa-onnx initialized successfully");
|
package/ios/SherpaOnnx+TTS.mm
CHANGED
|
@@ -58,6 +58,7 @@ static NSString *ttsModelKindToNSString(sherpaonnx::TtsModelKind kind) {
|
|
|
58
58
|
case K::kKitten: return @"kitten";
|
|
59
59
|
case K::kPocket: return @"pocket";
|
|
60
60
|
case K::kZipvoice: return @"zipvoice";
|
|
61
|
+
case K::kSupertonic: return @"supertonic";
|
|
61
62
|
default: return @"unknown";
|
|
62
63
|
}
|
|
63
64
|
}
|
package/ios/SherpaOnnx.mm
CHANGED
|
@@ -138,9 +138,15 @@
|
|
|
138
138
|
- (void)extractTarBz2:(NSString *)sourcePath
|
|
139
139
|
targetPath:(NSString *)targetPath
|
|
140
140
|
force:(BOOL)force
|
|
141
|
-
|
|
142
|
-
|
|
141
|
+
showNotificationsEnabled:(NSNumber *)showNotificationsEnabled
|
|
142
|
+
notificationTitle:(NSString *)notificationTitle
|
|
143
|
+
notificationText:(NSString *)notificationText
|
|
144
|
+
resolve:(RCTPromiseResolveBlock)resolve
|
|
145
|
+
reject:(RCTPromiseRejectBlock)reject
|
|
143
146
|
{
|
|
147
|
+
(void)showNotificationsEnabled;
|
|
148
|
+
(void)notificationTitle;
|
|
149
|
+
(void)notificationText;
|
|
144
150
|
SherpaOnnxArchiveHelper *helper = [SherpaOnnxArchiveHelper new];
|
|
145
151
|
NSDictionary *result = [helper extractTarBz2:sourcePath
|
|
146
152
|
targetPath:targetPath
|
|
@@ -165,9 +171,15 @@
|
|
|
165
171
|
- (void)extractTarZst:(NSString *)sourcePath
|
|
166
172
|
targetPath:(NSString *)targetPath
|
|
167
173
|
force:(BOOL)force
|
|
168
|
-
|
|
169
|
-
|
|
174
|
+
showNotificationsEnabled:(NSNumber *)showNotificationsEnabled
|
|
175
|
+
notificationTitle:(NSString *)notificationTitle
|
|
176
|
+
notificationText:(NSString *)notificationText
|
|
177
|
+
resolve:(RCTPromiseResolveBlock)resolve
|
|
178
|
+
reject:(RCTPromiseRejectBlock)reject
|
|
170
179
|
{
|
|
180
|
+
(void)showNotificationsEnabled;
|
|
181
|
+
(void)notificationTitle;
|
|
182
|
+
(void)notificationText;
|
|
171
183
|
SherpaOnnxArchiveHelper *helper = [SherpaOnnxArchiveHelper new];
|
|
172
184
|
NSDictionary *result = [helper extractTarZst:sourcePath
|
|
173
185
|
targetPath:targetPath
|
|
@@ -229,19 +241,33 @@
|
|
|
229
241
|
|
|
230
242
|
- (void)extractTarZstFromAsset:(NSString *)assetPath
|
|
231
243
|
targetPath:(NSString *)targetPath
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
244
|
+
force:(BOOL)force
|
|
245
|
+
showNotificationsEnabled:(NSNumber *)showNotificationsEnabled
|
|
246
|
+
notificationTitle:(NSString *)notificationTitle
|
|
247
|
+
notificationText:(NSString *)notificationText
|
|
248
|
+
resolve:(RCTPromiseResolveBlock)resolve
|
|
249
|
+
reject:(RCTPromiseRejectBlock)reject
|
|
235
250
|
{
|
|
251
|
+
(void)force;
|
|
252
|
+
(void)showNotificationsEnabled;
|
|
253
|
+
(void)notificationTitle;
|
|
254
|
+
(void)notificationText;
|
|
236
255
|
resolve(@{ @"success": @NO, @"reason": @"Not supported on iOS; use path-based extraction." });
|
|
237
256
|
}
|
|
238
257
|
|
|
239
258
|
- (void)extractTarBz2FromAsset:(NSString *)assetPath
|
|
240
259
|
targetPath:(NSString *)targetPath
|
|
241
|
-
force:(
|
|
242
|
-
|
|
243
|
-
|
|
260
|
+
force:(BOOL)force
|
|
261
|
+
showNotificationsEnabled:(NSNumber *)showNotificationsEnabled
|
|
262
|
+
notificationTitle:(NSString *)notificationTitle
|
|
263
|
+
notificationText:(NSString *)notificationText
|
|
264
|
+
resolve:(RCTPromiseResolveBlock)resolve
|
|
265
|
+
reject:(RCTPromiseRejectBlock)reject
|
|
244
266
|
{
|
|
267
|
+
(void)force;
|
|
268
|
+
(void)showNotificationsEnabled;
|
|
269
|
+
(void)notificationTitle;
|
|
270
|
+
(void)notificationText;
|
|
245
271
|
resolve(@{ @"success": @NO, @"reason": @"Not supported on iOS; use path-based extraction." });
|
|
246
272
|
}
|
|
247
273
|
|
|
@@ -329,15 +355,59 @@
|
|
|
329
355
|
nil);
|
|
330
356
|
return;
|
|
331
357
|
}
|
|
332
|
-
NSString *
|
|
333
|
-
|
|
358
|
+
NSString *fullPath = nil;
|
|
359
|
+
NSBundle *mainBundle = [NSBundle mainBundle];
|
|
360
|
+
NSString *assetDir = [assetPath stringByDeletingLastPathComponent];
|
|
361
|
+
NSString *assetNameWithExt = [assetPath lastPathComponent];
|
|
362
|
+
NSString *assetName = [assetNameWithExt stringByDeletingPathExtension];
|
|
363
|
+
NSString *assetExt = [assetNameWithExt pathExtension];
|
|
364
|
+
|
|
365
|
+
// 1) App bundle: regular nested path (keeps generic asset support)
|
|
366
|
+
NSString *mainPath = [mainBundle pathForResource:assetName
|
|
367
|
+
ofType:assetExt.length > 0 ? assetExt : nil
|
|
368
|
+
inDirectory:assetDir.length > 0 ? assetDir : nil];
|
|
369
|
+
if (mainPath.length > 0) {
|
|
370
|
+
fullPath = mainPath;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
// 2) CocoaPods resource bundle: files are flattened into bundle root
|
|
374
|
+
if (!fullPath) {
|
|
375
|
+
NSString *resBundlePath = [mainBundle pathForResource:@"SherpaOnnxResources"
|
|
376
|
+
ofType:@"bundle"];
|
|
377
|
+
if (resBundlePath.length > 0) {
|
|
378
|
+
NSBundle *resBundle = [NSBundle bundleWithPath:resBundlePath];
|
|
379
|
+
if (resBundle) {
|
|
380
|
+
NSString *bundleRootPath = [resBundle pathForResource:assetName
|
|
381
|
+
ofType:assetExt.length > 0 ? assetExt : nil];
|
|
382
|
+
if (bundleRootPath.length > 0) {
|
|
383
|
+
fullPath = bundleRootPath;
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
if (!fullPath) {
|
|
390
|
+
reject(@"ASSET_READ_ERROR",
|
|
391
|
+
[NSString stringWithFormat:@"Failed to locate asset %@", assetPath],
|
|
392
|
+
nil);
|
|
393
|
+
return;
|
|
394
|
+
}
|
|
395
|
+
|
|
334
396
|
NSError *error = nil;
|
|
335
|
-
NSString *content = [NSString stringWithContentsOfFile:fullPath
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
397
|
+
NSString *content = [NSString stringWithContentsOfFile:fullPath
|
|
398
|
+
encoding:NSUTF8StringEncoding
|
|
399
|
+
error:&error];
|
|
400
|
+
if (error || content == nil) {
|
|
401
|
+
reject(@"ASSET_READ_ERROR",
|
|
402
|
+
[NSString stringWithFormat:@"Failed to read asset %@ at %@: %@",
|
|
403
|
+
assetPath,
|
|
404
|
+
fullPath,
|
|
405
|
+
error.localizedDescription ?: @"Unknown error"],
|
|
406
|
+
error);
|
|
407
|
+
return;
|
|
340
408
|
}
|
|
409
|
+
|
|
410
|
+
resolve(content);
|
|
341
411
|
}
|
|
342
412
|
|
|
343
413
|
@end
|
|
package/ios/model_detect/sherpa-onnx-model-detect-helper.h
CHANGED
|
@@ -80,6 +80,11 @@ std::vector<LexiconCandidate> FindLexiconCandidates(
|
|
|
80
80
|
const std::string& rootDir
|
|
81
81
|
);
|
|
82
82
|
|
|
83
|
+
bool Qwen3TokenizerDirHasVocabAndMerges(
|
|
84
|
+
const std::vector<FileEntry>& files,
|
|
85
|
+
const std::string& dir
|
|
86
|
+
);
|
|
87
|
+
|
|
83
88
|
} // namespace model_detect
|
|
84
89
|
} // namespace sherpaonnx
|
|
85
90
|
|
|
package/ios/model_detect/sherpa-onnx-model-detect-helper.mm
CHANGED
|
@@ -257,5 +257,28 @@ std::vector<LexiconCandidate> FindLexiconCandidates(
|
|
|
257
257
|
return candidates;
|
|
258
258
|
}
|
|
259
259
|
|
|
260
|
+
bool Qwen3TokenizerDirHasVocabAndMerges(
|
|
261
|
+
const std::vector<FileEntry>& files,
|
|
262
|
+
const std::string& dirRaw
|
|
263
|
+
) {
|
|
264
|
+
std::string dir = dirRaw;
|
|
265
|
+
while (!dir.empty() && (dir.back() == '/' || dir.back() == '\\'))
|
|
266
|
+
dir.pop_back();
|
|
267
|
+
if (dir.empty()) return false;
|
|
268
|
+
bool hasVocab = false;
|
|
269
|
+
bool hasMerges = false;
|
|
270
|
+
const std::string prefix = dir + "/";
|
|
271
|
+
for (const auto& e : files) {
|
|
272
|
+
if (e.path.size() <= prefix.size()) continue;
|
|
273
|
+
if (e.path.compare(0, prefix.size(), prefix) != 0) continue;
|
|
274
|
+
std::string rest = e.path.substr(prefix.size());
|
|
275
|
+
if (rest.find('/') != std::string::npos || rest.find('\\') != std::string::npos) continue;
|
|
276
|
+
if (e.nameLower == "vocab.json") hasVocab = true;
|
|
277
|
+
if (e.nameLower == "merges.txt") hasMerges = true;
|
|
278
|
+
}
|
|
279
|
+
if (hasVocab && hasMerges) return true;
|
|
280
|
+
return FileExists(dir + "/vocab.json") && FileExists(dir + "/merges.txt");
|
|
281
|
+
}
|
|
282
|
+
|
|
260
283
|
} // namespace model_detect
|
|
261
284
|
} // namespace sherpaonnx
|
|
package/ios/model_detect/sherpa-onnx-model-detect-stt.mm
CHANGED
|
@@ -58,6 +58,7 @@ static const char* KindToName(SttModelKind k) {
|
|
|
58
58
|
case SttModelKind::kZipformerCtc: return "zipformer_ctc";
|
|
59
59
|
case SttModelKind::kWhisper: return "whisper";
|
|
60
60
|
case SttModelKind::kFunAsrNano: return "funasr_nano";
|
|
61
|
+
case SttModelKind::kQwen3Asr: return "qwen3_asr";
|
|
61
62
|
case SttModelKind::kFireRedAsr: return "fire_red_asr";
|
|
62
63
|
case SttModelKind::kMoonshine: return "moonshine";
|
|
63
64
|
case SttModelKind::kMoonshineV2: return "moonshine_v2";
|
|
@@ -85,6 +86,7 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
|
85
86
|
if (modelType == "zipformer_ctc" || modelType == "ctc") return SttModelKind::kZipformerCtc;
|
|
86
87
|
if (modelType == "whisper") return SttModelKind::kWhisper;
|
|
87
88
|
if (modelType == "funasr_nano") return SttModelKind::kFunAsrNano;
|
|
89
|
+
if (modelType == "qwen3_asr") return SttModelKind::kQwen3Asr;
|
|
88
90
|
if (modelType == "fire_red_asr") return SttModelKind::kFireRedAsr;
|
|
89
91
|
if (modelType == "moonshine") return SttModelKind::kMoonshine;
|
|
90
92
|
if (modelType == "moonshine_v2") return SttModelKind::kMoonshineV2;
|
|
@@ -123,6 +125,8 @@ static bool CapabilitySupportsKind(
|
|
|
123
125
|
return cap.hasWhisper;
|
|
124
126
|
case SttModelKind::kFunAsrNano:
|
|
125
127
|
return cap.hasFunAsrNano;
|
|
128
|
+
case SttModelKind::kQwen3Asr:
|
|
129
|
+
return cap.hasQwen3Asr;
|
|
126
130
|
case SttModelKind::kFireRedAsr:
|
|
127
131
|
return cap.hasFireRedAsr;
|
|
128
132
|
case SttModelKind::kMoonshine:
|
|
@@ -185,6 +189,8 @@ static std::vector<SttModelKind> GetKindsFromDirName(const std::string& modelDir
|
|
|
185
189
|
add(SttModelKind::kTransducer);
|
|
186
190
|
add(SttModelKind::kZipformerCtc);
|
|
187
191
|
}
|
|
192
|
+
if (lower.find("qwen3-asr") != std::string::npos || lower.find("qwen3_asr") != std::string::npos)
|
|
193
|
+
add(SttModelKind::kQwen3Asr);
|
|
188
194
|
if (lower.find("funasr") != std::string::npos)
|
|
189
195
|
add(SttModelKind::kFunAsrNano);
|
|
190
196
|
if (lower.find("canary") != std::string::npos)
|
|
@@ -245,6 +251,19 @@ static SttCandidatePaths GatherSttCandidatePaths(
|
|
|
245
251
|
p.funasrTokenizerDir = vocabInSubdir.substr(0, lastSlash);
|
|
246
252
|
}
|
|
247
253
|
}
|
|
254
|
+
p.qwen3ConvFrontend = FindOnnxByAnyToken(files, {"conv_frontend"}, preferInt8);
|
|
255
|
+
{
|
|
256
|
+
for (const auto& entry : files) {
|
|
257
|
+
if (entry.nameLower != "tokenizer_config.json") continue;
|
|
258
|
+
size_t slash = entry.path.find_last_of("/\\");
|
|
259
|
+
if (slash == std::string::npos) continue;
|
|
260
|
+
std::string dir = entry.path.substr(0, slash);
|
|
261
|
+
if (Qwen3TokenizerDirHasVocabAndMerges(files, dir)) {
|
|
262
|
+
p.qwen3TokenizerDir = dir;
|
|
263
|
+
break;
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
}
|
|
248
267
|
p.moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
|
|
249
268
|
p.moonshineEncoder = FindOnnxByAnyToken(files, {"encode", "encoder_model"}, preferInt8);
|
|
250
269
|
p.moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
|
|
@@ -254,7 +273,8 @@ static SttCandidatePaths GatherSttCandidatePaths(
|
|
|
254
273
|
static const std::vector<std::string> modelExcludes = {
|
|
255
274
|
"encoder", "decoder", "joiner", "vocoder", "acoustic", "embedding", "llm",
|
|
256
275
|
"encoder_adaptor", "encoder-adaptor", "encoder_model", "decoder_model",
|
|
257
|
-
"merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached"
|
|
276
|
+
"merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached",
|
|
277
|
+
"conv_frontend"
|
|
258
278
|
};
|
|
259
279
|
p.paraformerModel = FindOnnxByAnyToken(files, {"model"}, preferInt8);
|
|
260
280
|
if (!p.paraformerModel.empty()) {
|
|
@@ -297,6 +317,7 @@ static SttPathHints GetSttPathHints(const std::string& modelDir) {
|
|
|
297
317
|
h.isLikelyWenetCtc = lower.find("wenet") != std::string::npos;
|
|
298
318
|
h.isLikelySenseVoice = lower.find("sense") != std::string::npos || lower.find("sensevoice") != std::string::npos;
|
|
299
319
|
h.isLikelyFunAsrNano = lower.find("funasr") != std::string::npos || lower.find("funasr-nano") != std::string::npos;
|
|
320
|
+
h.isLikelyQwen3Asr = lower.find("qwen3-asr") != std::string::npos || lower.find("qwen3_asr") != std::string::npos;
|
|
300
321
|
h.isLikelyZipformer = lower.find("zipformer") != std::string::npos;
|
|
301
322
|
h.isLikelyMoonshine = lower.find("moonshine") != std::string::npos;
|
|
302
323
|
h.isLikelyDolphin = lower.find("dolphin") != std::string::npos;
|
|
@@ -338,7 +359,9 @@ static SttCapabilities ComputeSttCapabilities(const SttCandidatePaths& paths, co
|
|
|
338
359
|
c.hasTransducer = !paths.encoder.empty() && !paths.decoder.empty() && !paths.joiner.empty();
|
|
339
360
|
bool hasWhisperEnc = !paths.encoder.empty();
|
|
340
361
|
bool hasWhisperDec = !paths.decoder.empty();
|
|
341
|
-
|
|
362
|
+
bool hasQwen3Tok = !paths.qwen3TokenizerDir.empty();
|
|
363
|
+
c.hasQwen3Asr = !paths.qwen3ConvFrontend.empty() && hasWhisperEnc && hasWhisperDec && hasQwen3Tok;
|
|
364
|
+
c.hasWhisper = hasWhisperEnc && hasWhisperDec && paths.joiner.empty() && !c.hasQwen3Asr;
|
|
342
365
|
bool hasFunAsrTok = !paths.funasrTokenizerDir.empty();
|
|
343
366
|
c.hasFunAsrNano = !paths.funasrEncoderAdaptor.empty() && !paths.funasrLLM.empty() &&
|
|
344
367
|
!paths.funasrEmbedding.empty() && hasFunAsrTok;
|
|
@@ -378,6 +401,7 @@ static void CollectDetectedModels(
|
|
|
378
401
|
out.push_back({"paraformer", modelDir});
|
|
379
402
|
}
|
|
380
403
|
if (cap.hasWhisper) out.push_back({"whisper", modelDir});
|
|
404
|
+
if (cap.hasQwen3Asr) out.push_back({"qwen3_asr", modelDir});
|
|
381
405
|
if (cap.hasFunAsrNano) out.push_back({"funasr_nano", modelDir});
|
|
382
406
|
if (cap.hasMoonshine) out.push_back({"moonshine", modelDir});
|
|
383
407
|
if (cap.hasMoonshineV2) out.push_back({"moonshine_v2", modelDir});
|
|
@@ -439,6 +463,10 @@ static SttModelKind ResolveSttKind(
|
|
|
439
463
|
outError = "FunASR Nano model requested but required files not found in " + modelDir;
|
|
440
464
|
return SttModelKind::kUnknown;
|
|
441
465
|
}
|
|
466
|
+
if (selected == SttModelKind::kQwen3Asr && !cap.hasQwen3Asr) {
|
|
467
|
+
outError = "Qwen3-ASR model requested but conv_frontend/encoder/decoder/tokenizer not found in " + modelDir;
|
|
468
|
+
return SttModelKind::kUnknown;
|
|
469
|
+
}
|
|
442
470
|
if (selected == SttModelKind::kMoonshine && !cap.hasMoonshine) {
|
|
443
471
|
outError = "Moonshine v1 model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir;
|
|
444
472
|
return SttModelKind::kUnknown;
|
|
@@ -505,7 +533,9 @@ static SttModelKind ResolveSttKind(
|
|
|
505
533
|
if (!paths.paraformerModel.empty()) return SttModelKind::kParaformer;
|
|
506
534
|
if (cap.hasCanary) return SttModelKind::kCanary;
|
|
507
535
|
if (cap.hasFireRedAsr) return SttModelKind::kFireRedAsr;
|
|
536
|
+
if (cap.hasQwen3Asr && hints.isLikelyQwen3Asr) return SttModelKind::kQwen3Asr;
|
|
508
537
|
if (cap.hasWhisper) return SttModelKind::kWhisper;
|
|
538
|
+
if (cap.hasQwen3Asr) return SttModelKind::kQwen3Asr;
|
|
509
539
|
if (cap.hasFunAsrNano) return SttModelKind::kFunAsrNano;
|
|
510
540
|
if (cap.hasMoonshineV2) return SttModelKind::kMoonshineV2;
|
|
511
541
|
if (cap.hasDolphin) return SttModelKind::kDolphin;
|
|
@@ -551,6 +581,12 @@ static void ApplyPathsForSttKind(SttModelKind kind, const SttCandidatePaths& can
|
|
|
551
581
|
resultPaths.funasrEmbedding = candidate.funasrEmbedding;
|
|
552
582
|
resultPaths.funasrTokenizer = candidate.funasrTokenizerDir;
|
|
553
583
|
break;
|
|
584
|
+
case SttModelKind::kQwen3Asr:
|
|
585
|
+
resultPaths.qwen3ConvFrontend = candidate.qwen3ConvFrontend;
|
|
586
|
+
resultPaths.qwen3Encoder = candidate.encoder;
|
|
587
|
+
resultPaths.qwen3Decoder = candidate.decoder;
|
|
588
|
+
resultPaths.qwen3Tokenizer = candidate.qwen3TokenizerDir;
|
|
589
|
+
break;
|
|
554
590
|
case SttModelKind::kMoonshine:
|
|
555
591
|
resultPaths.moonshinePreprocessor = candidate.moonshinePreprocessor;
|
|
556
592
|
resultPaths.moonshineEncoder = candidate.moonshineEncoder;
|
|
@@ -624,13 +660,15 @@ SttDetectResult DetectSttModel(
|
|
|
624
660
|
EmptyOrPath(candidate.encoder), EmptyOrPath(candidate.decoder));
|
|
625
661
|
LOGI("DetectSttModel: funasr encoderAdaptor=%s llm=%s embedding=%s tokenizerDir=%s",
|
|
626
662
|
EmptyOrPath(candidate.funasrEncoderAdaptor), EmptyOrPath(candidate.funasrLLM), EmptyOrPath(candidate.funasrEmbedding), EmptyOrPath(candidate.funasrTokenizerDir));
|
|
627
|
-
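LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",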
|
|
663
|
+
LOGI("DetectSttModel: qwen3_asr conv=%s tokenizerDir=%s",
|
|
664
|
+
EmptyOrPath(candidate.qwen3ConvFrontend), EmptyOrPath(candidate.qwen3TokenizerDir));
|
|
665
|
+
LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasQwen3Asr=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
|
|
628
666
|
(int)cap.hasTransducer, (int)cap.hasWhisper, (int)cap.hasMoonshine, (int)cap.hasMoonshineV2,
|
|
629
|
-
(int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
|
|
667
|
+
(int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasQwen3Asr, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
|
|
630
668
|
(int)cap.hasCanary, (int)cap.hasOmnilingual, (int)cap.hasMedAsr, (int)cap.hasTeleSpeechCtc, (int)cap.hasToneCtc);
|
|
631
|
-
LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
|
|
669
|
+
LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyQwen3Asr=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
|
|
632
670
|
(int)hints.isLikelyNemo, (int)hints.isLikelyTdt, (int)hints.isLikelyWenetCtc, (int)hints.isLikelySenseVoice,
|
|
633
|
-
(int)hints.isLikelyFunAsrNano, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
|
|
671
|
+
(int)hints.isLikelyFunAsrNano, (int)hints.isLikelyQwen3Asr, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
|
|
634
672
|
(int)hints.isLikelyFireRedAsr, (int)hints.isLikelyCanary, (int)hints.isLikelyOmnilingual, (int)hints.isLikelyMedAsr,
|
|
635
673
|
(int)hints.isLikelyTeleSpeech, (int)hints.isLikelyToneCtc, (int)hints.isLikelyParaformer, (int)hints.isLikelyVad, (int)hints.isLikelyTdnn);
|
|
636
674
|
}
|
|
@@ -653,7 +691,8 @@ SttDetectResult DetectSttModel(
|
|
|
653
691
|
}
|
|
654
692
|
|
|
655
693
|
LOGI("DetectSttModel: selected kind=%d (%s)", static_cast<int>(result.selectedKind), KindToName(result.selectedKind));
|
|
656
|
-
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano);
|
|
694
|
+
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano &&
|
|
695
|
+
result.selectedKind != SttModelKind::kQwen3Asr);
|
|
657
696
|
ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
|
|
658
697
|
|
|
659
698
|
if (!candidate.tokens.empty() && FileExists(candidate.tokens)) {
|
|
@@ -711,6 +750,11 @@ SttDetectResult DetectSttModel(
|
|
|
711
750
|
EmptyOrPath(result.paths.funasrEncoderAdaptor), EmptyOrPath(result.paths.funasrLLM),
|
|
712
751
|
EmptyOrPath(result.paths.funasrEmbedding), EmptyOrPath(result.paths.funasrTokenizer));
|
|
713
752
|
break;
|
|
753
|
+
case SttModelKind::kQwen3Asr:
|
|
754
|
+
LOGI("DetectSttModel: paths set qwen3_asr conv=%s encoder=%s decoder=%s tokenizer=%s",
|
|
755
|
+
EmptyOrPath(result.paths.qwen3ConvFrontend), EmptyOrPath(result.paths.qwen3Encoder),
|
|
756
|
+
EmptyOrPath(result.paths.qwen3Decoder), EmptyOrPath(result.paths.qwen3Tokenizer));
|
|
757
|
+
break;
|
|
714
758
|
default:
|
|
715
759
|
break;
|
|
716
760
|
}
|
|
package/ios/model_detect/sherpa-onnx-model-detect-tts.mm
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* sherpa-onnx-model-detect-tts.mm
|
|
3
3
|
*
|
|
4
4
|
* Purpose: Detects TTS (text-to-speech) model type and fills TtsModelPaths from a model directory.
|
|
5
|
-
* Used by the TTS wrapper on iOS. Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
|
|
5
|
+
* Used by the TTS wrapper on iOS. Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice, Supertonic.
|
|
6
6
|
*
|
|
7
7
|
* --- Detection pipeline (overview) ---
|
|
8
8
|
*
|
|
@@ -58,6 +58,7 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
|
|
|
58
58
|
if (modelType == "kitten") return TtsModelKind::kKitten;
|
|
59
59
|
if (modelType == "pocket") return TtsModelKind::kPocket;
|
|
60
60
|
if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
|
|
61
|
+
if (modelType == "supertonic") return TtsModelKind::kSupertonic;
|
|
61
62
|
return TtsModelKind::kUnknown;
|
|
62
63
|
}
|
|
63
64
|
|
|
@@ -70,6 +71,7 @@ static bool CapabilitySupportsTtsKind(
|
|
|
70
71
|
bool hasMatcha,
|
|
71
72
|
bool hasPocket,
|
|
72
73
|
bool hasZipvoice,
|
|
74
|
+
bool hasSupertonic,
|
|
73
75
|
bool hasVoicesFile,
|
|
74
76
|
bool hasDataDir
|
|
75
77
|
) {
|
|
@@ -85,6 +87,8 @@ static bool CapabilitySupportsTtsKind(
|
|
|
85
87
|
return hasPocket;
|
|
86
88
|
case TtsModelKind::kZipvoice:
|
|
87
89
|
return hasZipvoice;
|
|
90
|
+
case TtsModelKind::kSupertonic:
|
|
91
|
+
return hasSupertonic;
|
|
88
92
|
default:
|
|
89
93
|
return false;
|
|
90
94
|
}
|
|
@@ -109,6 +113,7 @@ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& model
|
|
|
109
113
|
if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
|
|
110
114
|
if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
|
|
111
115
|
if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
|
|
116
|
+
if (lower.find("supertonic") != std::string::npos) add(TtsModelKind::kSupertonic);
|
|
112
117
|
if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
|
|
113
118
|
if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
|
|
114
119
|
if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
|
|
@@ -154,14 +159,27 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
154
159
|
std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
|
|
155
160
|
std::string vocabJsonFile = FindFileByName(files, "vocab.json");
|
|
156
161
|
std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
|
|
162
|
+
std::string durationPredictor = FindOnnxByAnyToken(files, {"duration_predictor", "duration-predictor"}, std::nullopt);
|
|
163
|
+
std::string textEncoderSupertonic = FindOnnxByAnyToken(files, {"text_encoder", "text-encoder"}, std::nullopt);
|
|
164
|
+
std::string vectorEstimator = FindOnnxByAnyToken(files, {"vector_estimator", "vector-estimator"}, std::nullopt);
|
|
165
|
+
std::string ttsJsonFile = FindFileByName(files, "tts.json");
|
|
166
|
+
std::string unicodeIndexerFile = FindFileByName(files, "unicode_indexer.bin");
|
|
167
|
+
std::string voiceStyleFile = FindFileByName(files, "voice.bin");
|
|
157
168
|
|
|
158
|
-
std::vector<std::string> modelExcludes = {
|
|
169
|
+
std::vector<std::string> modelExcludes = {
|
|
170
|
+
"acoustic", "vocoder", "encoder", "decoder", "joiner",
|
|
171
|
+
// Supertonic component models are not VITS monolithic model.onnx files.
|
|
172
|
+
"duration_predictor", "duration-predictor",
|
|
173
|
+
"text_encoder", "text-encoder",
|
|
174
|
+
"vector_estimator", "vector-estimator"
|
|
175
|
+
};
|
|
159
176
|
std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
|
|
160
177
|
if (ttsModel.empty()) {
|
|
161
178
|
ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
|
|
162
179
|
}
|
|
163
180
|
|
|
164
|
-
|
|
181
|
+
// VITS requires both model.onnx-like file and tokens.txt
|
|
182
|
+
bool hasVits = !ttsModel.empty() && !tokensFile.empty();
|
|
165
183
|
std::string modelDirLower = ToLower(modelDir);
|
|
166
184
|
bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
|
|
167
185
|
bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
|
|
@@ -178,6 +196,9 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
178
196
|
}
|
|
179
197
|
bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
|
|
180
198
|
!textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
|
|
199
|
+
bool hasSupertonic = !durationPredictor.empty() && !textEncoderSupertonic.empty() &&
|
|
200
|
+
!vectorEstimator.empty() && !vocoder.empty() && !ttsJsonFile.empty() &&
|
|
201
|
+
!unicodeIndexerFile.empty() && !voiceStyleFile.empty();
|
|
181
202
|
bool hasDataDir = !dataDirPath.empty();
|
|
182
203
|
|
|
183
204
|
bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
|
|
@@ -192,6 +213,9 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
192
213
|
if (hasZipvoice && !hasMatcha) {
|
|
193
214
|
result.detectedModels.push_back({"zipvoice", modelDir});
|
|
194
215
|
}
|
|
216
|
+
if (hasSupertonic) {
|
|
217
|
+
result.detectedModels.push_back({"supertonic", modelDir});
|
|
218
|
+
}
|
|
195
219
|
if (hasVoicesFile) {
|
|
196
220
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
197
221
|
result.detectedModels.push_back({"kitten", modelDir});
|
|
@@ -228,7 +252,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
228
252
|
std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
|
|
229
253
|
if (!nameCandidates.empty()) {
|
|
230
254
|
for (TtsModelKind k : nameCandidates) {
|
|
231
|
-
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
|
|
255
|
+
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice, hasSupertonic,
|
|
232
256
|
hasVoicesFile, hasDataDir)) {
|
|
233
257
|
selected = k;
|
|
234
258
|
break;
|
|
@@ -243,6 +267,8 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
243
267
|
selected = TtsModelKind::kPocket;
|
|
244
268
|
} else if (hasZipvoice) {
|
|
245
269
|
selected = TtsModelKind::kZipvoice;
|
|
270
|
+
} else if (hasSupertonic) {
|
|
271
|
+
selected = TtsModelKind::kSupertonic;
|
|
246
272
|
} else if (hasVoicesFile) {
|
|
247
273
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
248
274
|
selected = TtsModelKind::kKitten;
|
|
@@ -289,6 +315,12 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
289
315
|
result.paths.textConditioner = textConditioner;
|
|
290
316
|
result.paths.vocabJson = vocabJsonFile;
|
|
291
317
|
result.paths.tokenScoresJson = tokenScoresJsonFile;
|
|
318
|
+
result.paths.durationPredictor = durationPredictor;
|
|
319
|
+
result.paths.textEncoder = textEncoderSupertonic;
|
|
320
|
+
result.paths.vectorEstimator = vectorEstimator;
|
|
321
|
+
result.paths.ttsJson = ttsJsonFile;
|
|
322
|
+
result.paths.unicodeIndexer = unicodeIndexerFile;
|
|
323
|
+
result.paths.voiceStyle = voiceStyleFile;
|
|
292
324
|
|
|
293
325
|
auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
|
|
294
326
|
if (!validation.ok) {
|
|
@@ -19,6 +19,7 @@ enum class SttModelKind {
|
|
|
19
19
|
kZipformerCtc,
|
|
20
20
|
kWhisper,
|
|
21
21
|
kFunAsrNano,
|
|
22
|
+
kQwen3Asr,
|
|
22
23
|
kFireRedAsr,
|
|
23
24
|
kMoonshine,
|
|
24
25
|
kMoonshineV2,
|
|
@@ -37,7 +38,8 @@ enum class TtsModelKind {
|
|
|
37
38
|
kKokoro,
|
|
38
39
|
kKitten,
|
|
39
40
|
kPocket,
|
|
40
|
-
kZipvoice
|
|
41
|
+
kZipvoice,
|
|
42
|
+
kSupertonic
|
|
41
43
|
};
|
|
42
44
|
|
|
43
45
|
struct SttModelPaths {
|
|
@@ -55,6 +57,11 @@ struct SttModelPaths {
|
|
|
55
57
|
std::string funasrLLM;
|
|
56
58
|
std::string funasrEmbedding;
|
|
57
59
|
std::string funasrTokenizer;
|
|
60
|
+
/** Qwen3-ASR: conv_frontend.onnx + encoder + decoder + tokenizer dir (vocab.json, merges.txt, tokenizer_config.json). */
|
|
61
|
+
std::string qwen3ConvFrontend;
|
|
62
|
+
std::string qwen3Encoder;
|
|
63
|
+
std::string qwen3Decoder;
|
|
64
|
+
std::string qwen3Tokenizer;
|
|
58
65
|
std::string moonshinePreprocessor;
|
|
59
66
|
std::string moonshineEncoder;
|
|
60
67
|
std::string moonshineUncachedDecoder;
|
|
@@ -84,6 +91,8 @@ struct SttCandidatePaths {
|
|
|
84
91
|
std::string funasrLLM;
|
|
85
92
|
std::string funasrEmbedding;
|
|
86
93
|
std::string funasrTokenizerDir;
|
|
94
|
+
std::string qwen3ConvFrontend;
|
|
95
|
+
std::string qwen3TokenizerDir;
|
|
87
96
|
std::string moonshinePreprocessor;
|
|
88
97
|
std::string moonshineEncoder;
|
|
89
98
|
std::string moonshineUncachedDecoder;
|
|
@@ -99,6 +108,7 @@ struct SttPathHints {
|
|
|
99
108
|
bool isLikelyWenetCtc = false;
|
|
100
109
|
bool isLikelySenseVoice = false;
|
|
101
110
|
bool isLikelyFunAsrNano = false;
|
|
111
|
+
bool isLikelyQwen3Asr = false;
|
|
102
112
|
bool isLikelyZipformer = false;
|
|
103
113
|
bool isLikelyMoonshine = false;
|
|
104
114
|
bool isLikelyDolphin = false;
|
|
@@ -123,6 +133,7 @@ struct SttCapabilities {
|
|
|
123
133
|
bool hasMoonshineV2 = false;
|
|
124
134
|
bool hasParaformer = false;
|
|
125
135
|
bool hasFunAsrNano = false;
|
|
136
|
+
bool hasQwen3Asr = false;
|
|
126
137
|
bool hasDolphin = false;
|
|
127
138
|
bool hasFireRedAsr = false;
|
|
128
139
|
/** True when dir name suggests Fire Red but only a single CTC/paraformer model (no encoder/decoder). Use zipformer_ctc. */
|
|
@@ -150,6 +161,13 @@ struct TtsModelPaths {
|
|
|
150
161
|
std::string textConditioner;
|
|
151
162
|
std::string vocabJson;
|
|
152
163
|
std::string tokenScoresJson;
|
|
164
|
+
// Supertonic TTS
|
|
165
|
+
std::string durationPredictor;
|
|
166
|
+
std::string textEncoder;
|
|
167
|
+
std::string vectorEstimator;
|
|
168
|
+
std::string ttsJson;
|
|
169
|
+
std::string unicodeIndexer;
|
|
170
|
+
std::string voiceStyle;
|
|
153
171
|
};
|
|
154
172
|
|
|
155
173
|
struct SttDetectResult {
|
|
@@ -52,6 +52,13 @@ static const SttFieldRequirement kFunAsrNanoReqs[] = {
|
|
|
52
52
|
{"funasrTokenizer", &SttModelPaths::funasrTokenizer, true},
|
|
53
53
|
};
|
|
54
54
|
|
|
55
|
+
static const SttFieldRequirement kQwen3AsrReqs[] = {
|
|
56
|
+
{"qwen3ConvFrontend", &SttModelPaths::qwen3ConvFrontend, true},
|
|
57
|
+
{"qwen3Encoder", &SttModelPaths::qwen3Encoder, true},
|
|
58
|
+
{"qwen3Decoder", &SttModelPaths::qwen3Decoder, true},
|
|
59
|
+
{"qwen3Tokenizer", &SttModelPaths::qwen3Tokenizer, true},
|
|
60
|
+
};
|
|
61
|
+
|
|
55
62
|
static const SttFieldRequirement kMoonshineReqs[] = {
|
|
56
63
|
{"moonshinePreprocessor", &SttModelPaths::moonshinePreprocessor, true},
|
|
57
64
|
{"moonshineEncoder", &SttModelPaths::moonshineEncoder, true},
|
|
@@ -120,6 +127,9 @@ static const SttFieldRequirement* GetRequirements(SttModelKind kind, size_t& cou
|
|
|
120
127
|
case SttModelKind::kFunAsrNano:
|
|
121
128
|
count = std::size(kFunAsrNanoReqs);
|
|
122
129
|
return kFunAsrNanoReqs;
|
|
130
|
+
case SttModelKind::kQwen3Asr:
|
|
131
|
+
count = std::size(kQwen3AsrReqs);
|
|
132
|
+
return kQwen3AsrReqs;
|
|
123
133
|
case SttModelKind::kMoonshine:
|
|
124
134
|
count = std::size(kMoonshineReqs);
|
|
125
135
|
return kMoonshineReqs;
|
|
@@ -161,6 +171,7 @@ static const char* SttKindToName(SttModelKind k) {
|
|
|
161
171
|
case SttModelKind::kZipformerCtc: return "Zipformer CTC";
|
|
162
172
|
case SttModelKind::kWhisper: return "Whisper";
|
|
163
173
|
case SttModelKind::kFunAsrNano: return "FunASR Nano";
|
|
174
|
+
case SttModelKind::kQwen3Asr: return "Qwen3 ASR";
|
|
164
175
|
case SttModelKind::kFireRedAsr: return "Fire Red ASR";
|
|
165
176
|
case SttModelKind::kMoonshine: return "Moonshine";
|
|
166
177
|
case SttModelKind::kMoonshineV2: return "Moonshine v2";
|
|
@@ -59,6 +59,16 @@ static const TtsFieldRequirement kZipvoiceReqs[] = {
|
|
|
59
59
|
{"lexicon", &TtsModelPaths::lexicon, true},
|
|
60
60
|
};
|
|
61
61
|
|
|
62
|
+
static const TtsFieldRequirement kSupertonicReqs[] = {
|
|
63
|
+
{"durationPredictor", &TtsModelPaths::durationPredictor, true},
|
|
64
|
+
{"textEncoder", &TtsModelPaths::textEncoder, true},
|
|
65
|
+
{"vectorEstimator", &TtsModelPaths::vectorEstimator, true},
|
|
66
|
+
{"vocoder", &TtsModelPaths::vocoder, true},
|
|
67
|
+
{"ttsJson", &TtsModelPaths::ttsJson, true},
|
|
68
|
+
{"unicodeIndexer", &TtsModelPaths::unicodeIndexer, true},
|
|
69
|
+
{"voiceStyle", &TtsModelPaths::voiceStyle, true},
|
|
70
|
+
};
|
|
71
|
+
|
|
62
72
|
// ============================================================
|
|
63
73
|
|
|
64
74
|
static const TtsFieldRequirement* GetRequirements(TtsModelKind kind, size_t& count) {
|
|
@@ -79,6 +89,9 @@ static const TtsFieldRequirement* GetRequirements(TtsModelKind kind, size_t& cou
|
|
|
79
89
|
case TtsModelKind::kZipvoice:
|
|
80
90
|
count = std::size(kZipvoiceReqs);
|
|
81
91
|
return kZipvoiceReqs;
|
|
92
|
+
case TtsModelKind::kSupertonic:
|
|
93
|
+
count = std::size(kSupertonicReqs);
|
|
94
|
+
return kSupertonicReqs;
|
|
82
95
|
default:
|
|
83
96
|
count = 0;
|
|
84
97
|
return nullptr;
|
|
@@ -93,6 +106,7 @@ static const char* TtsKindToName(TtsModelKind k) {
|
|
|
93
106
|
case TtsModelKind::kKitten: return "Kitten";
|
|
94
107
|
case TtsModelKind::kPocket: return "Pocket";
|
|
95
108
|
case TtsModelKind::kZipvoice: return "Zipvoice";
|
|
109
|
+
case TtsModelKind::kSupertonic: return "Supertonic";
|
|
96
110
|
default: return "Unknown";
|
|
97
111
|
}
|
|
98
112
|
}
|