react-native-sherpa-onnx 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. package/README.md +232 -236
  2. package/SherpaOnnx.podspec +68 -64
  3. package/android/build.gradle +182 -192
  4. package/android/codegen.gradle +57 -0
  5. package/android/prebuilt-download.gradle +428 -0
  6. package/android/prebuilt-versions.gradle +43 -0
  7. package/android/proguard-rules.pro +10 -0
  8. package/android/src/main/assets/testModels/add_mul_add.onnx +28 -0
  9. package/android/src/main/assets/testModels/nnapi_internal_uint8_support.onnx +0 -0
  10. package/android/src/main/assets/testModels/qnn_multi_ctx_embed.onnx +0 -0
  11. package/android/src/main/cpp/CMakeLists.txt +166 -129
  12. package/android/src/main/cpp/CMakePresets.json +54 -0
  13. package/android/src/main/cpp/crypto/sha256.cpp +174 -0
  14. package/android/src/main/cpp/crypto/sha256.h +16 -0
  15. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +404 -0
  16. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +56 -0
  17. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +181 -0
  18. package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +888 -0
  19. package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-common.h +18 -18
  20. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +86 -0
  21. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +20 -0
  22. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +423 -0
  23. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +55 -0
  24. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +399 -0
  25. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +238 -0
  26. package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-model-detect.h +122 -89
  27. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +99 -0
  28. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.h +16 -0
  29. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +78 -0
  30. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.h +16 -0
  31. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +190 -0
  32. package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +301 -0
  33. package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +94 -0
  34. package/android/src/main/java/com/sherpaonnx/{SherpaOnnxCoreHelper.kt → SherpaOnnxAssetHelper.kt} +350 -236
  35. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +791 -483
  36. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +699 -109
  37. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +1123 -668
  38. package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +187 -0
  39. package/ios/SherpaOnnx+Assets.h +11 -0
  40. package/ios/SherpaOnnx+Assets.mm +325 -0
  41. package/ios/SherpaOnnx+STT.mm +455 -118
  42. package/ios/SherpaOnnx+TTS.mm +1101 -712
  43. package/ios/SherpaOnnx.h +17 -6
  44. package/ios/SherpaOnnx.mm +206 -311
  45. package/ios/SherpaOnnx.xcconfig +19 -19
  46. package/ios/SherpaOnnxCoreMLHelper.swift +24 -0
  47. package/ios/archive/sherpa-onnx-archive-helper.h +21 -0
  48. package/ios/archive/sherpa-onnx-archive-helper.mm +296 -0
  49. package/ios/libarchive_darwin_config.h +153 -0
  50. package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-common.h +18 -18
  51. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +49 -0
  52. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +210 -0
  53. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +344 -0
  54. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +201 -0
  55. package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-model-detect.h +117 -89
  56. package/ios/scripts/patch-libarchive-includes.sh +61 -0
  57. package/ios/scripts/setup-ios-libarchive.sh +98 -0
  58. package/ios/stt/sherpa-onnx-stt-wrapper.h +129 -0
  59. package/ios/stt/sherpa-onnx-stt-wrapper.mm +523 -0
  60. package/ios/{sherpa-onnx-tts-wrapper.h → tts/sherpa-onnx-tts-wrapper.h} +90 -85
  61. package/ios/{sherpa-onnx-tts-wrapper.mm → tts/sherpa-onnx-tts-wrapper.mm} +376 -345
  62. package/lib/module/NativeSherpaOnnx.js +3 -0
  63. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  64. package/lib/module/audio/index.js +22 -0
  65. package/lib/module/audio/index.js.map +1 -0
  66. package/lib/module/diarization/index.js +1 -1
  67. package/lib/module/diarization/index.js.map +1 -1
  68. package/lib/module/download/ModelDownloadManager.js +918 -0
  69. package/lib/module/download/ModelDownloadManager.js.map +1 -0
  70. package/lib/module/download/extractTarBz2.js +53 -0
  71. package/lib/module/download/extractTarBz2.js.map +1 -0
  72. package/lib/module/download/index.js +6 -0
  73. package/lib/module/download/index.js.map +1 -0
  74. package/lib/module/download/validation.js +178 -0
  75. package/lib/module/download/validation.js.map +1 -0
  76. package/lib/module/enhancement/index.js +1 -1
  77. package/lib/module/enhancement/index.js.map +1 -1
  78. package/lib/module/index.js +41 -3
  79. package/lib/module/index.js.map +1 -1
  80. package/lib/module/separation/index.js +1 -1
  81. package/lib/module/separation/index.js.map +1 -1
  82. package/lib/module/stt/index.js +127 -60
  83. package/lib/module/stt/index.js.map +1 -1
  84. package/lib/module/stt/sttModelLanguages.js +512 -0
  85. package/lib/module/stt/sttModelLanguages.js.map +1 -0
  86. package/lib/module/stt/types.js +53 -1
  87. package/lib/module/stt/types.js.map +1 -1
  88. package/lib/module/tts/index.js +216 -289
  89. package/lib/module/tts/index.js.map +1 -1
  90. package/lib/module/tts/types.js +86 -1
  91. package/lib/module/tts/types.js.map +1 -1
  92. package/lib/module/types.js.map +1 -1
  93. package/lib/module/utils.js +86 -73
  94. package/lib/module/utils.js.map +1 -1
  95. package/lib/module/vad/index.js +1 -1
  96. package/lib/module/vad/index.js.map +1 -1
  97. package/lib/typescript/src/NativeSherpaOnnx.d.ts +192 -38
  98. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  99. package/lib/typescript/src/audio/index.d.ts +13 -0
  100. package/lib/typescript/src/audio/index.d.ts.map +1 -0
  101. package/lib/typescript/src/diarization/index.d.ts +3 -2
  102. package/lib/typescript/src/diarization/index.d.ts.map +1 -1
  103. package/lib/typescript/src/download/ModelDownloadManager.d.ts +108 -0
  104. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -0
  105. package/lib/typescript/src/download/extractTarBz2.d.ts +14 -0
  106. package/lib/typescript/src/download/extractTarBz2.d.ts.map +1 -0
  107. package/lib/typescript/src/download/index.d.ts +7 -0
  108. package/lib/typescript/src/download/index.d.ts.map +1 -0
  109. package/lib/typescript/src/download/validation.d.ts +57 -0
  110. package/lib/typescript/src/download/validation.d.ts.map +1 -0
  111. package/lib/typescript/src/enhancement/index.d.ts +3 -2
  112. package/lib/typescript/src/enhancement/index.d.ts.map +1 -1
  113. package/lib/typescript/src/index.d.ts +26 -2
  114. package/lib/typescript/src/index.d.ts.map +1 -1
  115. package/lib/typescript/src/separation/index.d.ts +3 -2
  116. package/lib/typescript/src/separation/index.d.ts.map +1 -1
  117. package/lib/typescript/src/stt/index.d.ts +31 -43
  118. package/lib/typescript/src/stt/index.d.ts.map +1 -1
  119. package/lib/typescript/src/stt/sttModelLanguages.d.ts +52 -0
  120. package/lib/typescript/src/stt/sttModelLanguages.d.ts.map +1 -0
  121. package/lib/typescript/src/stt/types.d.ts +196 -9
  122. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  123. package/lib/typescript/src/tts/index.d.ts +25 -211
  124. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  125. package/lib/typescript/src/tts/types.d.ts +148 -25
  126. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  127. package/lib/typescript/src/types.d.ts +0 -32
  128. package/lib/typescript/src/types.d.ts.map +1 -1
  129. package/lib/typescript/src/utils.d.ts +28 -13
  130. package/lib/typescript/src/utils.d.ts.map +1 -1
  131. package/lib/typescript/src/vad/index.d.ts +3 -2
  132. package/lib/typescript/src/vad/index.d.ts.map +1 -1
  133. package/package.json +250 -222
  134. package/scripts/check-qnn-support.sh +78 -0
  135. package/scripts/setup-ios-framework.sh +379 -282
  136. package/src/NativeSherpaOnnx.ts +474 -251
  137. package/src/audio/index.ts +32 -0
  138. package/src/diarization/index.ts +4 -2
  139. package/src/download/ModelDownloadManager.ts +1325 -0
  140. package/src/download/extractTarBz2.ts +78 -0
  141. package/src/download/index.ts +43 -0
  142. package/src/download/validation.ts +279 -0
  143. package/src/enhancement/index.ts +4 -2
  144. package/src/index.tsx +78 -27
  145. package/src/separation/index.ts +4 -2
  146. package/src/stt/index.ts +249 -89
  147. package/src/stt/sttModelLanguages.ts +237 -0
  148. package/src/stt/types.ts +263 -9
  149. package/src/tts/index.ts +470 -458
  150. package/src/tts/types.ts +373 -218
  151. package/src/types.ts +0 -44
  152. package/src/utils.ts +145 -131
  153. package/src/vad/index.ts +4 -2
  154. package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -0
  155. package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -0
  156. package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -0
  157. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -0
  158. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -0
  159. package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +0 -1918
  160. package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +0 -841
  161. package/android/src/main/cpp/jni/sherpa-onnx-model-detect.cpp +0 -541
  162. package/android/src/main/cpp/jni/sherpa-onnx-stt-jni.cpp +0 -336
  163. package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.cpp +0 -222
  164. package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.h +0 -68
  165. package/android/src/main/cpp/jni/sherpa-onnx-tts-jni.cpp +0 -823
  166. package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.cpp +0 -387
  167. package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.h +0 -147
  168. package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
  169. package/ios/include/sherpa-onnx/c-api/c-api.h +0 -1918
  170. package/ios/include/sherpa-onnx/c-api/cxx-api.h +0 -841
  171. package/ios/sherpa-onnx-model-detect.mm +0 -441
  172. package/ios/sherpa-onnx-stt-wrapper.h +0 -48
  173. package/ios/sherpa-onnx-stt-wrapper.mm +0 -201
  174. package/scripts/copy-headers.js +0 -184
  175. package/scripts/setup-assets.js +0 -323
@@ -0,0 +1,399 @@
1
+ /**
2
+ * sherpa-onnx-model-detect-stt.cpp
3
+ *
4
+ * Purpose: Detects STT model type and fills SttModelPaths from a model directory. Supports
5
+ * transducer, paraformer, whisper, and other STT variants. Used by nativeDetectSttModel (module-jni).
6
+ */
7
+ #include "sherpa-onnx-model-detect.h"
8
+ #include "sherpa-onnx-model-detect-helper.h"
9
+ #include <android/log.h>
10
+ #include <cstdlib>
11
+ #include <string>
12
+ #include <algorithm>
13
+
14
+ #define LOG_TAG "SttModelDetect"
15
+ #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
16
+ #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
17
+
18
+ namespace sherpaonnx {
19
+ namespace {
20
+
21
+ SttModelKind ParseSttModelType(const std::string& modelType) {
22
+ if (modelType == "transducer") return SttModelKind::kTransducer;
23
+ if (modelType == "nemo_transducer") return SttModelKind::kNemoTransducer;
24
+ if (modelType == "paraformer") return SttModelKind::kParaformer;
25
+ if (modelType == "nemo_ctc") return SttModelKind::kNemoCtc;
26
+ if (modelType == "wenet_ctc") return SttModelKind::kWenetCtc;
27
+ if (modelType == "sense_voice") return SttModelKind::kSenseVoice;
28
+ if (modelType == "zipformer_ctc" || modelType == "ctc") return SttModelKind::kZipformerCtc;
29
+ if (modelType == "whisper") return SttModelKind::kWhisper;
30
+ if (modelType == "funasr_nano") return SttModelKind::kFunAsrNano;
31
+ if (modelType == "fire_red_asr") return SttModelKind::kFireRedAsr;
32
+ if (modelType == "moonshine") return SttModelKind::kMoonshine;
33
+ if (modelType == "dolphin") return SttModelKind::kDolphin;
34
+ if (modelType == "canary") return SttModelKind::kCanary;
35
+ if (modelType == "omnilingual") return SttModelKind::kOmnilingual;
36
+ if (modelType == "medasr") return SttModelKind::kMedAsr;
37
+ if (modelType == "telespeech_ctc") return SttModelKind::kTeleSpeechCtc;
38
+ return SttModelKind::kUnknown;
39
+ }
40
+
41
+ } // namespace
42
+
43
+ SttDetectResult DetectSttModel(
44
+ const std::string& modelDir,
45
+ const std::optional<bool>& preferInt8,
46
+ const std::optional<std::string>& modelType,
47
+ bool debug /* = false */
48
+ ) {
49
+ using namespace model_detect;
50
+
51
+ SttDetectResult result;
52
+
53
+ LOGI("DetectSttModel: modelDir=%s, modelType=%s, preferInt8=%s",
54
+ modelDir.c_str(),
55
+ modelType.has_value() ? modelType->c_str() : "auto",
56
+ preferInt8.has_value() ? (preferInt8.value() ? "true" : "false") : "unset");
57
+
58
+ if (modelDir.empty()) {
59
+ result.error = "Model directory is empty";
60
+ LOGE("%s", result.error.c_str());
61
+ return result;
62
+ }
63
+
64
+ if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
65
+ result.error = "Model directory does not exist or is not a directory: " + modelDir;
66
+ LOGE("%s", result.error.c_str());
67
+ return result;
68
+ }
69
+
70
+ // Depth 4 supports layouts like root/data/lang_bpe_500/tokens.txt (icefall, k2)
71
+ const int kMaxSearchDepth = 4;
72
+ const auto files = ListFilesRecursive(modelDir, kMaxSearchDepth);
73
+ bool verbose = debug;
74
+ LOGI("DetectSttModel: Found %zu files in %s (verbose=%d)", files.size(), modelDir.c_str(), (int)verbose);
75
+ if (verbose) {
76
+ for (const auto& f : files) {
77
+ LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
78
+ }
79
+ } else {
80
+ LOGI("(detailed file listing suppressed; enable by passing debug=true to initialize())");
81
+ }
82
+
83
+ std::string encoderPath = FindOnnxByAnyToken(files, {"encoder"}, preferInt8);
84
+ std::string decoderPath = FindOnnxByAnyToken(files, {"decoder"}, preferInt8);
85
+ std::string joinerPath = FindOnnxByAnyToken(files, {"joiner"}, preferInt8);
86
+
87
+ LOGI("DetectSttModel: encoder=%s, decoder=%s, joiner=%s",
88
+ encoderPath.c_str(), decoderPath.c_str(), joinerPath.c_str());
89
+
90
+ std::string funasrEncoderAdaptor = FindOnnxByAnyToken(files, {"encoder_adaptor", "encoder-adaptor"}, preferInt8);
91
+ std::string funasrLLM = FindOnnxByAnyToken(files, {"llm"}, preferInt8);
92
+ std::string funasrEmbedding = FindOnnxByAnyToken(files, {"embedding"}, preferInt8);
93
+
94
+ std::string funasrTokenizerDir = ResolveTokenizerDir(modelDir);
95
+
96
+ // Moonshine: preprocess, encode, uncached_decode, cached_decode
97
+ std::string moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
98
+ std::string moonshineEncoder = FindOnnxByAnyToken(files, {"encode"}, preferInt8);
99
+ std::string moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
100
+ std::string moonshineCachedDecoder = FindOnnxByAnyToken(files, {"cached_decode", "cached"}, preferInt8);
101
+
102
+ std::vector<std::string> modelExcludes = {
103
+ "encoder",
104
+ "decoder",
105
+ "joiner",
106
+ "vocoder",
107
+ "acoustic",
108
+ "embedding",
109
+ "llm",
110
+ "encoder_adaptor",
111
+ "encoder-adaptor"
112
+ };
113
+
114
+ std::string paraformerModelPath = FindOnnxByAnyToken(files, {"model"}, preferInt8);
115
+ if (paraformerModelPath.empty()) {
116
+ paraformerModelPath = FindLargestOnnxExcludingTokens(files, modelExcludes);
117
+ }
118
+
119
+ std::string ctcModelPath = FindOnnxByAnyToken(files, {"model"}, preferInt8);
120
+ if (ctcModelPath.empty()) {
121
+ ctcModelPath = FindLargestOnnxExcludingTokens(files, modelExcludes);
122
+ }
123
+
124
+ // Search for tokens file: first try exact "tokens.txt", then suffix match
125
+ // (e.g. "tiny-tokens.txt" for Whisper models). Use same depth as file list
126
+ // so layouts like root/data/lang_bpe_500/tokens.txt (icefall) are found.
127
+ std::string tokensPath = FindFileEndingWith(modelDir, "tokens.txt", kMaxSearchDepth);
128
+ LOGI("DetectSttModel: tokens=%s", tokensPath.c_str());
129
+
130
+ // Optional: BPE vocabulary for hotwords (sentencepiece bpe.vocab). Used when modeling_unit is bpe or cjkchar+bpe.
131
+ std::string bpeVocabPath = FindFileByName(modelDir, "bpe.vocab", kMaxSearchDepth);
132
+ if (!bpeVocabPath.empty()) {
133
+ LOGI("DetectSttModel: bpeVocab=%s", bpeVocabPath.c_str());
134
+ }
135
+
136
+ bool hasTransducer = !encoderPath.empty() && !decoderPath.empty() && !joinerPath.empty();
137
+
138
+ bool hasWhisperEncoder = !encoderPath.empty();
139
+ bool hasWhisperDecoder = !decoderPath.empty();
140
+ bool hasWhisper = hasWhisperEncoder && hasWhisperDecoder && joinerPath.empty();
141
+
142
+ bool hasFunAsrEncoderAdaptor = !funasrEncoderAdaptor.empty();
143
+ bool hasFunAsrLLM = !funasrLLM.empty();
144
+ bool hasFunAsrEmbedding = !funasrEmbedding.empty();
145
+ bool hasFunAsrTokenizer = !funasrTokenizerDir.empty() && FileExists(funasrTokenizerDir + "/vocab.json");
146
+ bool hasFunAsrNano = hasFunAsrEncoderAdaptor && hasFunAsrLLM && hasFunAsrEmbedding && hasFunAsrTokenizer;
147
+
148
+ // Case-insensitive path hints so "Nemo parakeet Tdt CTC 110m EN" etc. are recognized
149
+ std::string modelDirLower = model_detect::ToLower(modelDir);
150
+ bool isLikelyNemo = modelDirLower.find("nemo") != std::string::npos ||
151
+ modelDirLower.find("parakeet") != std::string::npos;
152
+ bool isLikelyTdt = modelDirLower.find("tdt") != std::string::npos;
153
+ bool isLikelyWenetCtc = modelDirLower.find("wenet") != std::string::npos;
154
+ bool isLikelySenseVoice = modelDirLower.find("sense") != std::string::npos ||
155
+ modelDirLower.find("sensevoice") != std::string::npos;
156
+ bool isLikelyFunAsrNano = modelDirLower.find("funasr") != std::string::npos ||
157
+ modelDirLower.find("funasr-nano") != std::string::npos;
158
+ bool isLikelyMoonshine = modelDirLower.find("moonshine") != std::string::npos;
159
+ bool isLikelyDolphin = modelDirLower.find("dolphin") != std::string::npos;
160
+ bool isLikelyFireRedAsr = modelDirLower.find("fire_red") != std::string::npos ||
161
+ modelDirLower.find("fire-red") != std::string::npos;
162
+ bool isLikelyCanary = modelDirLower.find("canary") != std::string::npos;
163
+ bool isLikelyOmnilingual = modelDirLower.find("omnilingual") != std::string::npos;
164
+ bool isLikelyMedAsr = modelDirLower.find("medasr") != std::string::npos;
165
+ bool isLikelyTeleSpeech = modelDirLower.find("telespeech") != std::string::npos;
166
+
167
+ bool hasMoonshine = !moonshinePreprocessor.empty() && !moonshineUncachedDecoder.empty() &&
168
+ !moonshineCachedDecoder.empty() && !moonshineEncoder.empty();
169
+ bool hasDolphin = isLikelyDolphin && !ctcModelPath.empty();
170
+ bool hasFireRedAsr = hasTransducer && isLikelyFireRedAsr;
171
+ // Canary (NeMo Canary) uses encoder + decoder without joiner; same file pattern as Whisper but path contains "canary"
172
+ bool hasCanary = hasWhisperEncoder && hasWhisperDecoder && joinerPath.empty() && isLikelyCanary;
173
+ bool hasOmnilingual = !ctcModelPath.empty() && isLikelyOmnilingual;
174
+ bool hasMedAsr = !ctcModelPath.empty() && isLikelyMedAsr;
175
+ bool hasTeleSpeechCtc = (!ctcModelPath.empty() || !paraformerModelPath.empty()) && isLikelyTeleSpeech;
176
+
177
+ if (hasTransducer) {
178
+ if (isLikelyNemo || isLikelyTdt) {
179
+ result.detectedModels.push_back({"nemo_transducer", modelDir});
180
+ } else {
181
+ result.detectedModels.push_back({"transducer", modelDir});
182
+ }
183
+ }
184
+
185
+ if (!ctcModelPath.empty() && (isLikelyNemo || isLikelyWenetCtc || isLikelySenseVoice)) {
186
+ if (isLikelyNemo) {
187
+ result.detectedModels.push_back({"nemo_ctc", modelDir});
188
+ } else if (isLikelyWenetCtc) {
189
+ result.detectedModels.push_back({"wenet_ctc", modelDir});
190
+ } else if (isLikelySenseVoice) {
191
+ result.detectedModels.push_back({"sense_voice", modelDir});
192
+ } else {
193
+ result.detectedModels.push_back({"ctc", modelDir});
194
+ }
195
+ } else if (!paraformerModelPath.empty()) {
196
+ result.detectedModels.push_back({"paraformer", modelDir});
197
+ }
198
+
199
+ if (hasWhisper) {
200
+ result.detectedModels.push_back({"whisper", modelDir});
201
+ }
202
+
203
+ if (hasFunAsrNano) {
204
+ result.detectedModels.push_back({"funasr_nano", modelDir});
205
+ }
206
+ if (hasMoonshine) {
207
+ result.detectedModels.push_back({"moonshine", modelDir});
208
+ }
209
+ if (hasDolphin) {
210
+ result.detectedModels.push_back({"dolphin", modelDir});
211
+ }
212
+ if (hasFireRedAsr) {
213
+ result.detectedModels.push_back({"fire_red_asr", modelDir});
214
+ }
215
+ if (hasCanary) {
216
+ result.detectedModels.push_back({"canary", modelDir});
217
+ }
218
+ if (hasOmnilingual) {
219
+ result.detectedModels.push_back({"omnilingual", modelDir});
220
+ }
221
+ if (hasMedAsr) {
222
+ result.detectedModels.push_back({"medasr", modelDir});
223
+ }
224
+ if (hasTeleSpeechCtc) {
225
+ result.detectedModels.push_back({"telespeech_ctc", modelDir});
226
+ }
227
+
228
+ SttModelKind selected = SttModelKind::kUnknown;
229
+
230
+ if (modelType.has_value() && modelType.value() != "auto") {
231
+ selected = ParseSttModelType(modelType.value());
232
+ if (selected == SttModelKind::kUnknown) {
233
+ result.error = "Unknown model type: " + modelType.value();
234
+ return result;
235
+ }
236
+
237
+ if (selected == SttModelKind::kTransducer && !hasTransducer) {
238
+ result.error = "Transducer model requested but files not found in " + modelDir;
239
+ return result;
240
+ }
241
+ if (selected == SttModelKind::kNemoTransducer && !hasTransducer) {
242
+ result.error = "NeMo Transducer model requested but encoder/decoder/joiner not found in " + modelDir;
243
+ return result;
244
+ }
245
+ if (selected == SttModelKind::kParaformer && paraformerModelPath.empty()) {
246
+ result.error = "Paraformer model requested but model file not found in " + modelDir;
247
+ return result;
248
+ }
249
+ if ((selected == SttModelKind::kNemoCtc || selected == SttModelKind::kWenetCtc ||
250
+ selected == SttModelKind::kSenseVoice || selected == SttModelKind::kZipformerCtc) &&
251
+ ctcModelPath.empty()) {
252
+ result.error = "CTC model requested but model file not found in " + modelDir;
253
+ return result;
254
+ }
255
+ if (selected == SttModelKind::kWhisper && !hasWhisper) {
256
+ result.error = "Whisper model requested but encoder/decoder not found in " + modelDir;
257
+ return result;
258
+ }
259
+ if (selected == SttModelKind::kFunAsrNano && !hasFunAsrNano) {
260
+ result.error = "FunASR Nano model requested but required files not found in " + modelDir;
261
+ return result;
262
+ }
263
+ if (selected == SttModelKind::kMoonshine && !hasMoonshine) {
264
+ result.error = "Moonshine model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir;
265
+ return result;
266
+ }
267
+ if (selected == SttModelKind::kDolphin && !hasDolphin) {
268
+ result.error = "Dolphin model requested but model not found in " + modelDir;
269
+ return result;
270
+ }
271
+ if (selected == SttModelKind::kFireRedAsr && !hasFireRedAsr) {
272
+ result.error = "FireRed ASR model requested but encoder/decoder not found in " + modelDir;
273
+ return result;
274
+ }
275
+ if (selected == SttModelKind::kCanary && !hasCanary) {
276
+ result.error = "Canary model requested but encoder/decoder not found in " + modelDir;
277
+ return result;
278
+ }
279
+ if (selected == SttModelKind::kOmnilingual && !hasOmnilingual) {
280
+ result.error = "Omnilingual model requested but model not found in " + modelDir;
281
+ return result;
282
+ }
283
+ if (selected == SttModelKind::kMedAsr && !hasMedAsr) {
284
+ result.error = "MedASR model requested but model not found in " + modelDir;
285
+ return result;
286
+ }
287
+ if (selected == SttModelKind::kTeleSpeechCtc && !hasTeleSpeechCtc) {
288
+ result.error = "TeleSpeech CTC model requested but model not found in " + modelDir;
289
+ return result;
290
+ }
291
+ } else {
292
+ if (hasTransducer) {
293
+ selected = (isLikelyNemo || isLikelyTdt) ? SttModelKind::kNemoTransducer : SttModelKind::kTransducer;
294
+ } else if (!ctcModelPath.empty() && (isLikelyNemo || isLikelyWenetCtc || isLikelySenseVoice)) {
295
+ if (isLikelyNemo) {
296
+ selected = SttModelKind::kNemoCtc;
297
+ } else if (isLikelyWenetCtc) {
298
+ selected = SttModelKind::kWenetCtc;
299
+ } else {
300
+ selected = SttModelKind::kSenseVoice;
301
+ }
302
+ } else if (hasFunAsrNano && isLikelyFunAsrNano) {
303
+ selected = SttModelKind::kFunAsrNano;
304
+ } else if (!paraformerModelPath.empty()) {
305
+ selected = SttModelKind::kParaformer;
306
+ } else if (hasCanary) {
307
+ selected = SttModelKind::kCanary;
308
+ } else if (hasFireRedAsr) {
309
+ selected = SttModelKind::kFireRedAsr;
310
+ } else if (hasWhisper) {
311
+ selected = SttModelKind::kWhisper;
312
+ } else if (hasFunAsrNano) {
313
+ selected = SttModelKind::kFunAsrNano;
314
+ } else if (hasMoonshine && isLikelyMoonshine) {
315
+ selected = SttModelKind::kMoonshine;
316
+ } else if (hasDolphin) {
317
+ selected = SttModelKind::kDolphin;
318
+ } else if (hasOmnilingual) {
319
+ selected = SttModelKind::kOmnilingual;
320
+ } else if (hasMedAsr) {
321
+ selected = SttModelKind::kMedAsr;
322
+ } else if (hasTeleSpeechCtc) {
323
+ selected = SttModelKind::kTeleSpeechCtc;
324
+ } else if (!ctcModelPath.empty()) {
325
+ selected = SttModelKind::kZipformerCtc;
326
+ }
327
+ }
328
+
329
+ if (selected == SttModelKind::kUnknown) {
330
+ result.error = "No compatible model type detected in " + modelDir;
331
+ LOGE("%s", result.error.c_str());
332
+ return result;
333
+ }
334
+
335
+ LOGI("DetectSttModel: selected kind=%d", static_cast<int>(selected));
336
+ result.selectedKind = selected;
337
+ // sherpa-onnx's OfflineModelConfig::Validate() requires tokens for ALL models
338
+ // except FunASR-nano (which uses its own tokenizer directory).
339
+ // Whisper models also need tokens.txt despite seeming self-contained.
340
+ result.tokensRequired = (selected != SttModelKind::kFunAsrNano);
341
+
342
+ if (selected == SttModelKind::kTransducer || selected == SttModelKind::kNemoTransducer) {
343
+ result.paths.encoder = encoderPath;
344
+ result.paths.decoder = decoderPath;
345
+ result.paths.joiner = joinerPath;
346
+ } else if (selected == SttModelKind::kParaformer) {
347
+ result.paths.paraformerModel = paraformerModelPath;
348
+ } else if (selected == SttModelKind::kNemoCtc || selected == SttModelKind::kWenetCtc ||
349
+ selected == SttModelKind::kSenseVoice || selected == SttModelKind::kZipformerCtc) {
350
+ result.paths.ctcModel = ctcModelPath;
351
+ } else if (selected == SttModelKind::kWhisper) {
352
+ result.paths.whisperEncoder = encoderPath;
353
+ result.paths.whisperDecoder = decoderPath;
354
+ } else if (selected == SttModelKind::kFunAsrNano) {
355
+ result.paths.funasrEncoderAdaptor = funasrEncoderAdaptor;
356
+ result.paths.funasrLLM = funasrLLM;
357
+ result.paths.funasrEmbedding = funasrEmbedding;
358
+ // FunASR Nano C++ expects tokenizer directory (e.g. .../Qwen3-0.6B), not path to vocab.json
359
+ result.paths.funasrTokenizer = funasrTokenizerDir;
360
+ } else if (selected == SttModelKind::kMoonshine) {
361
+ result.paths.moonshinePreprocessor = moonshinePreprocessor;
362
+ result.paths.moonshineEncoder = moonshineEncoder;
363
+ result.paths.moonshineUncachedDecoder = moonshineUncachedDecoder;
364
+ result.paths.moonshineCachedDecoder = moonshineCachedDecoder;
365
+ } else if (selected == SttModelKind::kDolphin) {
366
+ result.paths.dolphinModel = ctcModelPath.empty() ? paraformerModelPath : ctcModelPath;
367
+ } else if (selected == SttModelKind::kFireRedAsr) {
368
+ result.paths.fireRedEncoder = encoderPath;
369
+ result.paths.fireRedDecoder = decoderPath;
370
+ } else if (selected == SttModelKind::kCanary) {
371
+ result.paths.canaryEncoder = encoderPath;
372
+ result.paths.canaryDecoder = decoderPath;
373
+ } else if (selected == SttModelKind::kOmnilingual) {
374
+ result.paths.omnilingualModel = ctcModelPath;
375
+ } else if (selected == SttModelKind::kMedAsr) {
376
+ result.paths.medasrModel = ctcModelPath;
377
+ } else if (selected == SttModelKind::kTeleSpeechCtc) {
378
+ result.paths.telespeechCtcModel = ctcModelPath.empty() ? paraformerModelPath : ctcModelPath;
379
+ }
380
+
381
+ if (!tokensPath.empty() && FileExists(tokensPath)) {
382
+ result.paths.tokens = tokensPath;
383
+ } else if (result.tokensRequired) {
384
+ result.error = "Tokens file not found in " + modelDir;
385
+ LOGE("%s", result.error.c_str());
386
+ return result;
387
+ }
388
+
389
+ if (!bpeVocabPath.empty() && FileExists(bpeVocabPath)) {
390
+ result.paths.bpeVocab = bpeVocabPath;
391
+ }
392
+
393
+ LOGI("DetectSttModel: detection OK for %s — tokens=%s",
394
+ modelDir.c_str(), result.paths.tokens.c_str());
395
+ result.ok = true;
396
+ return result;
397
+ }
398
+
399
+ } // namespace sherpaonnx
@@ -0,0 +1,238 @@
1
+ /**
2
+ * sherpa-onnx-model-detect-tts.cpp
3
+ *
4
+ * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Supports Vits,
5
+ * Piper, Kokoro, Zipvoice, Pocket, etc. Used by nativeDetectTtsModel (module-jni).
6
+ */
7
+ #include "sherpa-onnx-model-detect.h"
8
+ #include "sherpa-onnx-model-detect-helper.h"
9
+ #include <android/log.h>
10
+
11
+ #define LOG_TAG "TtsModelDetect"
12
+ #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
13
+ #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
14
+
15
+ namespace sherpaonnx {
16
+ namespace {
17
+
18
+ TtsModelKind ParseTtsModelType(const std::string& modelType) {
19
+ if (modelType == "vits") return TtsModelKind::kVits;
20
+ if (modelType == "matcha") return TtsModelKind::kMatcha;
21
+ if (modelType == "kokoro") return TtsModelKind::kKokoro;
22
+ if (modelType == "kitten") return TtsModelKind::kKitten;
23
+ if (modelType == "pocket") return TtsModelKind::kPocket;
24
+ if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
25
+ return TtsModelKind::kUnknown;
26
+ }
27
+
28
+ } // namespace
29
+
30
+ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
31
+ using namespace model_detect;
32
+
33
+ TtsDetectResult result;
34
+
35
+ LOGI("DetectTtsModel: modelDir=%s, modelType=%s", modelDir.c_str(), modelType.c_str());
36
+
37
+ if (modelDir.empty()) {
38
+ result.error = "TTS: Model directory is empty";
39
+ LOGE("%s", result.error.c_str());
40
+ return result;
41
+ }
42
+
43
+ if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
44
+ result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
45
+ LOGE("%s", result.error.c_str());
46
+ return result;
47
+ }
48
+
49
+ const auto files = ListFilesRecursive(modelDir, 4);
50
+ LOGI("DetectTtsModel: Found %zu files in %s", files.size(), modelDir.c_str());
51
+ for (const auto& f : files) {
52
+ LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
53
+ }
54
+
55
+ std::string tokensFile = FindFileByName(modelDir, "tokens.txt", 2);
56
+ std::string lexiconFile = FindFileByName(modelDir, "lexicon.txt", 2);
57
+ std::string dataDirPath = FindDirectoryByName(modelDir, "espeak-ng-data", 2);
58
+ std::string voicesFile = FindFileByName(modelDir, "voices.bin", 2);
59
+
60
+ LOGI("DetectTtsModel: tokens=%s, lexicon=%s, dataDir=%s, voices=%s",
61
+ tokensFile.c_str(), lexiconFile.c_str(), dataDirPath.c_str(), voicesFile.c_str());
62
+
63
+ std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
64
+ // Note: matches either a "vocoder" or "vocos" ONNX file; both are stored in this field.
65
+ std::string vocoder = FindOnnxByAnyToken(files, {"vocoder", "vocos"}, std::nullopt);
66
+ std::string encoder = FindOnnxByAnyToken(files, {"encoder"}, std::nullopt);
67
+ std::string decoder = FindOnnxByAnyToken(files, {"decoder"}, std::nullopt);
68
+ std::string lmFlow = FindOnnxByAnyToken(files, {"lm_flow", "lm-flow"}, std::nullopt);
69
+ std::string lmMain = FindOnnxByAnyToken(files, {"lm_main", "lm-main"}, std::nullopt);
70
+ std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
71
+ std::string vocabJsonFile = FindFileByName(modelDir, "vocab.json", 2);
72
+ std::string tokenScoresJsonFile = FindFileByName(modelDir, "token_scores.json", 2);
73
+
74
+ LOGI("DetectTtsModel: acousticModel=%s, vocoder=%s, encoder=%s, decoder=%s",
75
+ acousticModel.c_str(), vocoder.c_str(), encoder.c_str(), decoder.c_str());
76
+ LOGI("DetectTtsModel: lmFlow=%s, lmMain=%s, textConditioner=%s, vocabJson=%s, tokenScoresJson=%s",
77
+ lmFlow.c_str(), lmMain.c_str(), textConditioner.c_str(), vocabJsonFile.c_str(), tokenScoresJsonFile.c_str());
78
+
79
+ std::vector<std::string> modelExcludes = {
80
+ "acoustic",
81
+ "vocoder",
82
+ "encoder",
83
+ "decoder",
84
+ "joiner"
85
+ };
86
+
87
+ std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
88
+ if (ttsModel.empty()) {
89
+ ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
90
+ }
91
+ LOGI("DetectTtsModel: ttsModel=%s", ttsModel.c_str());
92
+
93
+ bool hasVits = !ttsModel.empty();
94
+ bool hasMatcha = !acousticModel.empty() && !vocoder.empty();
95
+ bool hasVoicesFile = !voicesFile.empty() && FileExists(voicesFile);
96
+ // Zipvoice requires encoder + decoder + vocoder (full model). Distill variants (no vocoder) are not supported by the native layer.
97
+ bool hasZipvoice = !encoder.empty() && !decoder.empty() && !vocoder.empty();
98
+ bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
99
+ !textConditioner.empty() && !vocabJsonFile.empty() && FileExists(vocabJsonFile) &&
100
+ !tokenScoresJsonFile.empty() && FileExists(tokenScoresJsonFile);
101
+ bool hasDataDir = !dataDirPath.empty() && IsDirectory(dataDirPath);
102
+
103
+ std::string modelDirLower = ToLower(modelDir);
104
+ bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
105
+ bool isLikelyKokoro = modelDirLower.find("kokoro") != std::string::npos;
106
+
107
+ if (hasMatcha) {
108
+ result.detectedModels.push_back({"matcha", modelDir});
109
+ }
110
+ if (hasPocket) {
111
+ result.detectedModels.push_back({"pocket", modelDir});
112
+ }
113
+ if (hasZipvoice && !hasMatcha) {
114
+ result.detectedModels.push_back({"zipvoice", modelDir});
115
+ }
116
+ if (hasVoicesFile) {
117
+ if (isLikelyKitten && !isLikelyKokoro) {
118
+ result.detectedModels.push_back({"kitten", modelDir});
119
+ } else if (isLikelyKokoro && !isLikelyKitten) {
120
+ result.detectedModels.push_back({"kokoro", modelDir});
121
+ } else {
122
+ result.detectedModels.push_back({"kokoro", modelDir});
123
+ result.detectedModels.push_back({"kitten", modelDir});
124
+ }
125
+ }
126
+
127
+ if (hasVits) {
128
+ bool isLikelyVits = modelDirLower.find("vits") != std::string::npos;
129
+ bool voicesAmbiguous = !isLikelyKitten && !isLikelyKokoro;
130
+
131
+ bool addVits = false;
132
+ if (!hasVoicesFile) {
133
+ addVits = true;
134
+ } else {
135
+ if (isLikelyVits || voicesAmbiguous) {
136
+ addVits = true;
137
+ }
138
+ }
139
+
140
+ if (addVits) {
141
+ result.detectedModels.push_back({"vits", modelDir});
142
+ }
143
+ }
144
+
145
+ TtsModelKind selected = TtsModelKind::kUnknown;
146
+ if (modelType != "auto") {
147
+ selected = ParseTtsModelType(modelType);
148
+ if (selected == TtsModelKind::kUnknown) {
149
+ result.error = "TTS: Unknown model type: " + modelType;
150
+ return result;
151
+ }
152
+ } else {
153
+ if (hasMatcha) {
154
+ selected = TtsModelKind::kMatcha;
155
+ } else if (hasPocket) {
156
+ selected = TtsModelKind::kPocket;
157
+ } else if (hasZipvoice) {
158
+ selected = TtsModelKind::kZipvoice;
159
+ } else if (hasVoicesFile) {
160
+ if (isLikelyKitten && !isLikelyKokoro) {
161
+ selected = TtsModelKind::kKitten;
162
+ } else if (isLikelyKokoro && !isLikelyKitten) {
163
+ selected = TtsModelKind::kKokoro;
164
+ } else {
165
+ selected = TtsModelKind::kKokoro;
166
+ }
167
+ } else if (hasVits) {
168
+ selected = TtsModelKind::kVits;
169
+ }
170
+ }
171
+
172
+ if (selected == TtsModelKind::kUnknown) {
173
+ result.error = "TTS: No compatible model type detected in " + modelDir;
174
+ return result;
175
+ }
176
+
177
+ if (selected == TtsModelKind::kVits && !hasVits) {
178
+ result.error = "TTS: VITS model requested but model file not found in " + modelDir;
179
+ return result;
180
+ }
181
+ if (selected == TtsModelKind::kMatcha && !hasMatcha) {
182
+ result.error = "TTS: Matcha model requested but required files not found in " + modelDir;
183
+ return result;
184
+ }
185
+ if ((selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten) && (!hasVits || !hasVoicesFile)) {
186
+ result.error = "TTS: Kokoro/Kitten model requested but required files not found in " + modelDir;
187
+ return result;
188
+ }
189
+ if (selected == TtsModelKind::kPocket && !hasPocket) {
190
+ result.error = "TTS: Pocket model requested but required files not found in " + modelDir;
191
+ return result;
192
+ }
193
+ if (selected == TtsModelKind::kZipvoice && !hasZipvoice) {
194
+ result.error = "TTS: Zipvoice model requested but required files not found in " + modelDir;
195
+ return result;
196
+ }
197
+ if ((selected == TtsModelKind::kVits || selected == TtsModelKind::kMatcha ||
198
+ selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten ||
199
+ selected == TtsModelKind::kZipvoice) &&
200
+ !hasDataDir) {
201
+ result.error = "TTS: espeak-ng-data not found in " + modelDir +
202
+ ". Copy espeak-ng-data into the model directory.";
203
+ return result;
204
+ }
205
+
206
+ result.selectedKind = selected;
207
+ result.paths.ttsModel = ttsModel;
208
+ result.paths.tokens = tokensFile;
209
+ result.paths.lexicon = !lexiconFile.empty() && FileExists(lexiconFile) ? lexiconFile : "";
210
+ result.paths.dataDir = dataDirPath;
211
+ result.paths.voices = voicesFile;
212
+ result.paths.acousticModel = acousticModel;
213
+ result.paths.vocoder = vocoder;
214
+ result.paths.encoder = encoder;
215
+ result.paths.decoder = decoder;
216
+ result.paths.lmFlow = lmFlow;
217
+ result.paths.lmMain = lmMain;
218
+ result.paths.textConditioner = textConditioner;
219
+ result.paths.vocabJson = vocabJsonFile;
220
+ result.paths.tokenScoresJson = tokenScoresJsonFile;
221
+
222
+ LOGI("DetectTtsModel: selected kind=%d, ttsModel=%s",
223
+ static_cast<int>(selected), ttsModel.c_str());
224
+ LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
225
+ result.paths.tokens.c_str(), result.paths.dataDir.c_str());
226
+
227
+ if (selected != TtsModelKind::kPocket && (tokensFile.empty() || !FileExists(tokensFile))) {
228
+ result.error = "TTS: tokens.txt not found in " + modelDir;
229
+ LOGE("%s", result.error.c_str());
230
+ return result;
231
+ }
232
+
233
+ result.ok = true;
234
+ LOGI("DetectTtsModel: detection OK for %s", modelDir.c_str());
235
+ return result;
236
+ }
237
+
238
+ } // namespace sherpaonnx