react-native-sherpa-onnx 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +84 -77
  2. package/SherpaOnnx.podspec +79 -45
  3. package/android/build.gradle +8 -2
  4. package/android/prebuilt-download.gradle +70 -16
  5. package/android/prebuilt-versions.gradle +14 -6
  6. package/android/src/main/cpp/CMakeLists.txt +2 -0
  7. package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +202 -328
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +22 -0
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +2 -0
  10. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +96 -142
  11. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +40 -4
  12. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +774 -316
  13. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +208 -122
  14. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +92 -0
  15. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
  16. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +14 -2
  17. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +229 -0
  18. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.h +38 -0
  19. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +144 -0
  20. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.h +38 -0
  21. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +1 -1
  22. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +157 -11
  23. package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
  24. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +75 -24
  25. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +52 -1
  26. package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
  27. package/ios/SherpaOnnx+STT.mm +2 -0
  28. package/ios/SherpaOnnx+TTS.mm +17 -0
  29. package/ios/SherpaOnnx.mm +27 -3
  30. package/ios/SherpaOnnxAudioConvert.h +28 -0
  31. package/ios/SherpaOnnxAudioConvert.mm +698 -0
  32. package/ios/archive/sherpa-onnx-archive-helper.mm +12 -0
  33. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +37 -3
  34. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +80 -45
  35. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +629 -267
  36. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +148 -56
  37. package/ios/model_detect/sherpa-onnx-model-detect.h +72 -0
  38. package/ios/model_detect/sherpa-onnx-validate-stt.h +38 -0
  39. package/ios/model_detect/sherpa-onnx-validate-stt.mm +229 -0
  40. package/ios/model_detect/sherpa-onnx-validate-tts.h +38 -0
  41. package/ios/model_detect/sherpa-onnx-validate-tts.mm +144 -0
  42. package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
  43. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  44. package/lib/module/audio/index.js +55 -1
  45. package/lib/module/audio/index.js.map +1 -1
  46. package/lib/module/download/ModelDownloadManager.js +14 -0
  47. package/lib/module/download/ModelDownloadManager.js.map +1 -1
  48. package/lib/module/index.js +10 -0
  49. package/lib/module/index.js.map +1 -1
  50. package/lib/module/stt/streaming.js +6 -3
  51. package/lib/module/stt/streaming.js.map +1 -1
  52. package/lib/module/tts/index.js +13 -1
  53. package/lib/module/tts/index.js.map +1 -1
  54. package/lib/typescript/src/NativeSherpaOnnx.d.ts +32 -3
  55. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  56. package/lib/typescript/src/audio/index.d.ts +20 -1
  57. package/lib/typescript/src/audio/index.d.ts.map +1 -1
  58. package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
  59. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
  60. package/lib/typescript/src/index.d.ts +10 -0
  61. package/lib/typescript/src/index.d.ts.map +1 -1
  62. package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
  63. package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
  64. package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
  65. package/lib/typescript/src/tts/index.d.ts +12 -1
  66. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  67. package/package.json +6 -1
  68. package/scripts/check-model-csvs.sh +72 -0
  69. package/scripts/setup-ios-framework.sh +272 -191
  70. package/src/NativeSherpaOnnx.ts +37 -3
  71. package/src/audio/index.ts +84 -1
  72. package/src/download/ModelDownloadManager.ts +19 -0
  73. package/src/index.tsx +15 -0
  74. package/src/stt/streaming.ts +10 -5
  75. package/src/stt/streamingTypes.ts +1 -1
  76. package/src/tts/index.ts +25 -1
  77. package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -1
  78. package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
  79. package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
  80. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
  81. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
  82. package/ios/scripts/patch-libarchive-includes.sh +0 -61
  83. package/ios/scripts/setup-ios-libarchive.sh +0 -98
@@ -1,16 +1,50 @@
1
1
  /**
2
2
  * sherpa-onnx-model-detect-tts.cpp
3
3
  *
4
- * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Supports Vits,
5
- * Piper, Kokoro, Zipvoice, Pocket, etc. Used by nativeDetectTtsModel (module-jni).
4
+ * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
5
+ * nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
6
+ *
7
+ * --- Detection pipeline (overview) ---
8
+ *
9
+ * 1. Gather files in modelDir (recursive), then map file names to logical paths (ttsModel,
10
+ * acousticModel, vocoder, encoder, decoder, lmFlow, lmMain, textConditioner, tokens, lexicon,
11
+ * dataDir, voices, vocabJson, tokenScoresJson). Path hints from directory name (isLikelyVits,
12
+ * isLikelyKitten, isLikelyKokoro).
13
+ *
14
+ * 2. Capabilities (hasVits, hasMatcha, hasPocket, hasZipvoice, hasVoicesFile, hasDataDir): which
15
+ * model types are *possible* given the paths. Multiple can be true (e.g. voices.bin can satisfy
16
+ * both Kokoro and Kitten).
17
+ *
18
+ * 3. detectedModels (for UI "Select model type"): built from capabilities only. Every kind with
19
+ * the corresponding has* == true is added (with existing rules: zipvoice only if !hasMatcha,
20
+ * vits when hasVits and no voices or ambiguous folder name).
21
+ *
22
+ * 4. selectedKind: from ResolveTtsKind(). If modelType is explicit, use it if capabilities allow.
23
+ * If modelType == "auto": Priority 1 = folder name (GetKindsFromDirNameTts: tokens like "vits",
24
+ * "matcha", "kokoro" in dir name --> candidate kinds). Priority 2 = among those candidates, pick
25
+ * the first that CapabilitySupportsTtsKind(). Fallback = file-only order (matcha --> pocket -->
26
+ * zipvoice --> kokoro/kitten --> vits).
27
+ *
28
+ * 5. paths: all gathered paths are written into result.paths; the selected kind determines which
29
+ * engine is used at runtime.
30
+ *
31
+ * Result to caller: ok, error, detectedModels (list), selectedKind (single), paths.
6
32
  */
7
33
  #include "sherpa-onnx-model-detect.h"
8
34
  #include "sherpa-onnx-model-detect-helper.h"
35
+ #include "sherpa-onnx-validate-tts.h"
36
+ #include <algorithm>
37
+ #include <string>
38
+ #include <vector>
39
+ #ifdef __ANDROID__
9
40
  #include <android/log.h>
10
-
11
41
  #define LOG_TAG "TtsModelDetect"
12
42
  #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
13
43
  #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
44
+ #else
45
+ #define LOGI(...) ((void)0)
46
+ #define LOGE(...) ((void)0)
47
+ #endif
14
48
 
15
49
  namespace sherpaonnx {
16
50
  namespace {
@@ -25,94 +59,120 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
25
59
  return TtsModelKind::kUnknown;
26
60
  }
27
61
 
28
- } // namespace
62
+ /** Returns true if the given kind is supported by the current paths and hints (required files present).
63
+ * data_dir (espeak-ng-data) is required only for Kitten and Kokoro (sherpa-onnx config Validate());
64
+ * VITS, Matcha, Zipvoice use it optionally; Pocket does not use it. */
65
+ static bool CapabilitySupportsTtsKind(
66
+ TtsModelKind kind,
67
+ bool hasVits,
68
+ bool hasMatcha,
69
+ bool hasPocket,
70
+ bool hasZipvoice,
71
+ bool hasVoicesFile,
72
+ bool hasDataDir
73
+ ) {
74
+ switch (kind) {
75
+ case TtsModelKind::kVits:
76
+ return hasVits;
77
+ case TtsModelKind::kMatcha:
78
+ return hasMatcha;
79
+ case TtsModelKind::kKokoro:
80
+ case TtsModelKind::kKitten:
81
+ return hasVoicesFile && hasDataDir;
82
+ case TtsModelKind::kPocket:
83
+ return hasPocket;
84
+ case TtsModelKind::kZipvoice:
85
+ return hasZipvoice;
86
+ default:
87
+ return false;
88
+ }
89
+ }
29
90
 
30
- TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
91
+ /**
92
+ * Priority 1: Collect candidate TTS kinds from the model directory name (last path component).
93
+ * Tokens like "vits", "matcha", "kokoro" are matched case-insensitively. Returns candidates in a
94
+ * fixed priority order for file-based disambiguation when multiple names match.
95
+ */
96
+ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& modelDir) {
31
97
  using namespace model_detect;
98
+ size_t pos = modelDir.find_last_of("/\\");
99
+ std::string base = (pos == std::string::npos) ? modelDir : modelDir.substr(pos + 1);
100
+ std::string lower = ToLower(base);
32
101
 
33
- TtsDetectResult result;
34
-
35
- LOGI("DetectTtsModel: modelDir=%s, modelType=%s", modelDir.c_str(), modelType.c_str());
102
+ std::vector<TtsModelKind> out;
103
+ auto add = [&out](TtsModelKind k) {
104
+ if (std::find(out.begin(), out.end(), k) == out.end())
105
+ out.push_back(k);
106
+ };
36
107
 
37
- if (modelDir.empty()) {
38
- result.error = "TTS: Model directory is empty";
39
- LOGE("%s", result.error.c_str());
40
- return result;
41
- }
108
+ if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
109
+ if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
110
+ if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
111
+ if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
112
+ if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
113
+ if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
42
114
 
43
- if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
44
- result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
45
- LOGE("%s", result.error.c_str());
46
- return result;
47
- }
115
+ return out;
116
+ }
48
117
 
49
- const auto files = ListFilesRecursive(modelDir, 4);
50
- LOGI("DetectTtsModel: Found %zu files in %s", files.size(), modelDir.c_str());
51
- for (const auto& f : files) {
52
- LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
53
- }
118
+ /** Shared detection logic: runs on a pre-built file list. No filesystem access, no logging. */
119
+ static TtsDetectResult DetectTtsModelFromFiles(
120
+ const std::vector<model_detect::FileEntry>& files,
121
+ const std::string& modelDir,
122
+ const std::string& modelType
123
+ ) {
124
+ using namespace model_detect;
54
125
 
55
- std::string tokensFile = FindFileByName(modelDir, "tokens.txt", 2);
56
- std::string lexiconFile = FindFileByName(modelDir, "lexicon.txt", 2);
57
- std::string dataDirPath = FindDirectoryByName(modelDir, "espeak-ng-data", 2);
58
- std::string voicesFile = FindFileByName(modelDir, "voices.bin", 2);
126
+ TtsDetectResult result;
59
127
 
60
- LOGI("DetectTtsModel: tokens=%s, lexicon=%s, dataDir=%s, voices=%s",
61
- tokensFile.c_str(), lexiconFile.c_str(), dataDirPath.c_str(), voicesFile.c_str());
128
+ std::string tokensFile = FindFileByName(files, "tokens.txt");
129
+ std::vector<LexiconCandidate> lexiconCandidates = FindLexiconCandidates(files, modelDir);
130
+ std::string dataDirPath = FindDirectoryUnderRoot(files, modelDir, "espeak-ng-data");
131
+ std::string voicesFile = FindFileByName(files, "voices.bin");
62
132
 
63
133
  std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
64
- // Note: matches either a "vocoder" or "vocos" ONNX file; both are stored in this field.
65
134
  std::string vocoder = FindOnnxByAnyToken(files, {"vocoder", "vocos"}, std::nullopt);
66
135
  std::string encoder = FindOnnxByAnyToken(files, {"encoder"}, std::nullopt);
67
136
  std::string decoder = FindOnnxByAnyToken(files, {"decoder"}, std::nullopt);
68
137
  std::string lmFlow = FindOnnxByAnyToken(files, {"lm_flow", "lm-flow"}, std::nullopt);
69
138
  std::string lmMain = FindOnnxByAnyToken(files, {"lm_main", "lm-main"}, std::nullopt);
70
139
  std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
71
- std::string vocabJsonFile = FindFileByName(modelDir, "vocab.json", 2);
72
- std::string tokenScoresJsonFile = FindFileByName(modelDir, "token_scores.json", 2);
73
-
74
- LOGI("DetectTtsModel: acousticModel=%s, vocoder=%s, encoder=%s, decoder=%s",
75
- acousticModel.c_str(), vocoder.c_str(), encoder.c_str(), decoder.c_str());
76
- LOGI("DetectTtsModel: lmFlow=%s, lmMain=%s, textConditioner=%s, vocabJson=%s, tokenScoresJson=%s",
77
- lmFlow.c_str(), lmMain.c_str(), textConditioner.c_str(), vocabJsonFile.c_str(), tokenScoresJsonFile.c_str());
140
+ std::string vocabJsonFile = FindFileByName(files, "vocab.json");
141
+ std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
78
142
 
79
143
  std::vector<std::string> modelExcludes = {
80
- "acoustic",
81
- "vocoder",
82
- "encoder",
83
- "decoder",
84
- "joiner"
144
+ "acoustic", "vocoder", "encoder", "decoder", "joiner"
85
145
  };
86
-
87
146
  std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
88
147
  if (ttsModel.empty()) {
89
148
  ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
90
149
  }
91
- LOGI("DetectTtsModel: ttsModel=%s", ttsModel.c_str());
92
150
 
93
151
  bool hasVits = !ttsModel.empty();
94
- bool hasMatcha = !acousticModel.empty() && !vocoder.empty();
95
- bool hasVoicesFile = !voicesFile.empty() && FileExists(voicesFile);
96
- // Zipvoice requires encoder + decoder + vocoder (full model). Distill variants (no vocoder) are not supported by the native layer.
152
+ std::string modelDirLower = ToLower(modelDir);
153
+ bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
154
+ bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
155
+ || (isLikelyMatcha && !ttsModel.empty() && !tokensFile.empty());
156
+ if (hasMatcha && acousticModel.empty())
157
+ acousticModel = ttsModel; // single-file Matcha: model.onnx is the acoustic model
158
+ bool hasVoicesFile = !voicesFile.empty();
159
+ bool isLikelyZipvoice = modelDirLower.find("zipvoice") != std::string::npos;
97
160
  bool hasZipvoice = !encoder.empty() && !decoder.empty() && !vocoder.empty();
161
+ if (isLikelyZipvoice && !encoder.empty() && !decoder.empty() && vocoder.empty()) {
162
+ result.ok = false;
163
+ result.error = "TTS: Zipvoice distill variant (no vocoder) is not supported. Use a full Zipvoice model with vocoder or add vocos_24khz.onnx separately.";
164
+ return result;
165
+ }
98
166
  bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
99
- !textConditioner.empty() && !vocabJsonFile.empty() && FileExists(vocabJsonFile) &&
100
- !tokenScoresJsonFile.empty() && FileExists(tokenScoresJsonFile);
101
- bool hasDataDir = !dataDirPath.empty() && IsDirectory(dataDirPath);
167
+ !textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
168
+ bool hasDataDir = !dataDirPath.empty();
102
169
 
103
- std::string modelDirLower = ToLower(modelDir);
104
170
  bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
105
171
  bool isLikelyKokoro = modelDirLower.find("kokoro") != std::string::npos;
106
172
 
107
- if (hasMatcha) {
108
- result.detectedModels.push_back({"matcha", modelDir});
109
- }
110
- if (hasPocket) {
111
- result.detectedModels.push_back({"pocket", modelDir});
112
- }
113
- if (hasZipvoice && !hasMatcha) {
114
- result.detectedModels.push_back({"zipvoice", modelDir});
115
- }
173
+ if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
174
+ if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
175
+ if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
116
176
  if (hasVoicesFile) {
117
177
  if (isLikelyKitten && !isLikelyKokoro) {
118
178
  result.detectedModels.push_back({"kitten", modelDir});
@@ -123,23 +183,11 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
123
183
  result.detectedModels.push_back({"kitten", modelDir});
124
184
  }
125
185
  }
126
-
127
186
  if (hasVits) {
128
187
  bool isLikelyVits = modelDirLower.find("vits") != std::string::npos;
129
188
  bool voicesAmbiguous = !isLikelyKitten && !isLikelyKokoro;
130
-
131
- bool addVits = false;
132
- if (!hasVoicesFile) {
133
- addVits = true;
134
- } else {
135
- if (isLikelyVits || voicesAmbiguous) {
136
- addVits = true;
137
- }
138
- }
139
-
140
- if (addVits) {
141
- result.detectedModels.push_back({"vits", modelDir});
142
- }
189
+ bool addVits = !hasVoicesFile || isLikelyVits || voicesAmbiguous;
190
+ if (addVits) result.detectedModels.push_back({"vits", modelDir});
143
191
  }
144
192
 
145
193
  TtsModelKind selected = TtsModelKind::kUnknown;
@@ -150,22 +198,25 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
150
198
  return result;
151
199
  }
152
200
  } else {
153
- if (hasMatcha) {
154
- selected = TtsModelKind::kMatcha;
155
- } else if (hasPocket) {
156
- selected = TtsModelKind::kPocket;
157
- } else if (hasZipvoice) {
158
- selected = TtsModelKind::kZipvoice;
159
- } else if (hasVoicesFile) {
160
- if (isLikelyKitten && !isLikelyKokoro) {
161
- selected = TtsModelKind::kKitten;
162
- } else if (isLikelyKokoro && !isLikelyKitten) {
163
- selected = TtsModelKind::kKokoro;
164
- } else {
165
- selected = TtsModelKind::kKokoro;
201
+ std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
202
+ if (!nameCandidates.empty()) {
203
+ for (TtsModelKind k : nameCandidates) {
204
+ if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
205
+ hasVoicesFile, hasDataDir)) {
206
+ selected = k;
207
+ break;
208
+ }
166
209
  }
167
- } else if (hasVits) {
168
- selected = TtsModelKind::kVits;
210
+ }
211
+ if (selected == TtsModelKind::kUnknown) {
212
+ if (hasMatcha) selected = TtsModelKind::kMatcha;
213
+ else if (hasPocket) selected = TtsModelKind::kPocket;
214
+ else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
215
+ else if (hasVoicesFile) {
216
+ if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
217
+ else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
218
+ else selected = TtsModelKind::kKokoro;
219
+ } else if (hasVits) selected = TtsModelKind::kVits;
169
220
  }
170
221
  }
171
222
 
@@ -174,39 +225,22 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
174
225
  return result;
175
226
  }
176
227
 
177
- if (selected == TtsModelKind::kVits && !hasVits) {
178
- result.error = "TTS: VITS model requested but model file not found in " + modelDir;
179
- return result;
228
+ std::string lexiconPath;
229
+ for (const auto& c : lexiconCandidates) {
230
+ result.lexiconLanguageCandidates.push_back(c.languageId);
180
231
  }
181
- if (selected == TtsModelKind::kMatcha && !hasMatcha) {
182
- result.error = "TTS: Matcha model requested but required files not found in " + modelDir;
183
- return result;
184
- }
185
- if ((selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten) && (!hasVits || !hasVoicesFile)) {
186
- result.error = "TTS: Kokoro/Kitten model requested but required files not found in " + modelDir;
187
- return result;
188
- }
189
- if (selected == TtsModelKind::kPocket && !hasPocket) {
190
- result.error = "TTS: Pocket model requested but required files not found in " + modelDir;
191
- return result;
192
- }
193
- if (selected == TtsModelKind::kZipvoice && !hasZipvoice) {
194
- result.error = "TTS: Zipvoice model requested but required files not found in " + modelDir;
195
- return result;
232
+ if (!lexiconCandidates.empty()) {
233
+ lexiconPath = lexiconCandidates[0].path;
196
234
  }
197
- if ((selected == TtsModelKind::kVits || selected == TtsModelKind::kMatcha ||
198
- selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten ||
199
- selected == TtsModelKind::kZipvoice) &&
200
- !hasDataDir) {
201
- result.error = "TTS: espeak-ng-data not found in " + modelDir +
202
- ". Copy espeak-ng-data into the model directory.";
203
- return result;
235
+
236
+ if (selected == TtsModelKind::kMatcha && !acousticModel.empty() && vocoder.empty()) {
237
+ vocoder = acousticModel;
204
238
  }
205
239
 
206
240
  result.selectedKind = selected;
207
241
  result.paths.ttsModel = ttsModel;
208
242
  result.paths.tokens = tokensFile;
209
- result.paths.lexicon = !lexiconFile.empty() && FileExists(lexiconFile) ? lexiconFile : "";
243
+ result.paths.lexicon = lexiconPath;
210
244
  result.paths.dataDir = dataDirPath;
211
245
  result.paths.voices = voicesFile;
212
246
  result.paths.acousticModel = acousticModel;
@@ -219,20 +253,72 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
219
253
  result.paths.vocabJson = vocabJsonFile;
220
254
  result.paths.tokenScoresJson = tokenScoresJsonFile;
221
255
 
222
- LOGI("DetectTtsModel: selected kind=%d, ttsModel=%s",
223
- static_cast<int>(selected), ttsModel.c_str());
224
- LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
225
- result.paths.tokens.c_str(), result.paths.dataDir.c_str());
256
+ auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
257
+ if (!validation.ok) {
258
+ result.ok = false;
259
+ result.error = validation.error;
260
+ return result;
261
+ }
226
262
 
227
- if (selected != TtsModelKind::kPocket && (tokensFile.empty() || !FileExists(tokensFile))) {
228
- result.error = "TTS: tokens.txt not found in " + modelDir;
263
+ result.ok = true;
264
+ return result;
265
+ }
266
+
267
+ } // namespace
268
+
269
+ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
270
+ using namespace model_detect;
271
+
272
+ TtsDetectResult result;
273
+
274
+ LOGI("DetectTtsModel: modelDir=%s, modelType=%s", modelDir.c_str(), modelType.c_str());
275
+
276
+ if (modelDir.empty()) {
277
+ result.error = "TTS: Model directory is empty";
229
278
  LOGE("%s", result.error.c_str());
230
279
  return result;
231
280
  }
232
281
 
233
- result.ok = true;
282
+ if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
283
+ result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
284
+ LOGE("%s", result.error.c_str());
285
+ return result;
286
+ }
287
+
288
+ const auto files = ListFilesRecursive(modelDir, 4);
289
+ LOGI("DetectTtsModel: Found %zu files in %s", files.size(), modelDir.c_str());
290
+ for (const auto& f : files) {
291
+ LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
292
+ }
293
+
294
+ result = DetectTtsModelFromFiles(files, modelDir, modelType);
295
+ if (!result.ok) {
296
+ if (!result.error.empty()) LOGE("%s", result.error.c_str());
297
+ return result;
298
+ }
299
+ LOGI("DetectTtsModel: tokens=%s, lexicon=%s, dataDir=%s, voices=%s",
300
+ result.paths.tokens.c_str(), result.paths.lexicon.c_str(),
301
+ result.paths.dataDir.c_str(), result.paths.voices.c_str());
302
+ LOGI("DetectTtsModel: selected kind=%d, ttsModel=%s",
303
+ static_cast<int>(result.selectedKind), result.paths.ttsModel.c_str());
304
+ LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
305
+ result.paths.tokens.c_str(), result.paths.dataDir.c_str());
234
306
  LOGI("DetectTtsModel: detection OK for %s", modelDir.c_str());
235
307
  return result;
236
308
  }
237
309
 
310
+ // Test-only: used by host-side model_detect_test; not used in production (Android/iOS use DetectTtsModel).
311
+ TtsDetectResult DetectTtsModelFromFileList(
312
+ const std::vector<model_detect::FileEntry>& files,
313
+ const std::string& modelDir,
314
+ const std::string& modelType
315
+ ) {
316
+ TtsDetectResult result;
317
+ if (modelDir.empty()) {
318
+ result.error = "TTS: Model directory is empty";
319
+ return result;
320
+ }
321
+ return DetectTtsModelFromFiles(files, modelDir, modelType);
322
+ }
323
+
238
324
  } // namespace sherpaonnx
@@ -2,6 +2,7 @@
2
2
  #define SHERPA_ONNX_MODEL_DETECT_H
3
3
 
4
4
  #include "sherpa-onnx-common.h"
5
+ #include "sherpa-onnx-model-detect-helper.h"
5
6
  #include <optional>
6
7
  #include <string>
7
8
  #include <vector>
@@ -21,6 +22,7 @@ enum class SttModelKind {
21
22
  kFunAsrNano,
22
23
  kFireRedAsr,
23
24
  kMoonshine,
25
+ kMoonshineV2,
24
26
  kDolphin,
25
27
  kCanary,
26
28
  kOmnilingual,
@@ -59,6 +61,8 @@ struct SttModelPaths {
59
61
  std::string moonshineEncoder;
60
62
  std::string moonshineUncachedDecoder;
61
63
  std::string moonshineCachedDecoder;
64
+ /** Moonshine v2: encoder + mergedDecoder (reuse moonshineEncoder for encoder path). */
65
+ std::string moonshineMergedDecoder;
62
66
  // Dolphin, Omnilingual, MedAsr, TeleSpeech (single model each)
63
67
  std::string dolphinModel;
64
68
  std::string omnilingualModel;
@@ -71,6 +75,69 @@ struct SttModelPaths {
71
75
  std::string canaryDecoder;
72
76
  };
73
77
 
78
+ /** All candidate paths gathered before model kind selection (used by STT detection steps). */
79
+ struct SttCandidatePaths {
80
+ std::string encoder;
81
+ std::string decoder;
82
+ std::string joiner;
83
+ std::string paraformerModel;
84
+ std::string ctcModel;
85
+ std::string tokens;
86
+ std::string bpeVocab;
87
+ std::string funasrEncoderAdaptor;
88
+ std::string funasrLLM;
89
+ std::string funasrEmbedding;
90
+ std::string funasrTokenizerDir;
91
+ std::string moonshinePreprocessor;
92
+ std::string moonshineEncoder;
93
+ std::string moonshineUncachedDecoder;
94
+ std::string moonshineCachedDecoder;
95
+ std::string moonshineMergedDecoder;
96
+ std::string encoderForV2;
97
+ };
98
+
99
+ /** Path hints derived from model directory name (isLikely* flags). */
100
+ struct SttPathHints {
101
+ bool isLikelyNemo = false;
102
+ bool isLikelyTdt = false;
103
+ bool isLikelyWenetCtc = false;
104
+ bool isLikelySenseVoice = false;
105
+ bool isLikelyFunAsrNano = false;
106
+ bool isLikelyZipformer = false;
107
+ bool isLikelyMoonshine = false;
108
+ bool isLikelyDolphin = false;
109
+ bool isLikelyFireRedAsr = false;
110
+ bool isLikelyCanary = false;
111
+ bool isLikelyOmnilingual = false;
112
+ bool isLikelyMedAsr = false;
113
+ bool isLikelyTeleSpeech = false;
114
+ bool isLikelyToneCtc = false;
115
+ bool isLikelyParaformer = false;
116
+ /** VAD (silero, ten-vad, etc.): not yet supported; when true, detection returns unsupported. */
117
+ bool isLikelyVad = false;
118
+ /** TDNN (keyword/yesno): not yet supported; when true, detection returns unsupported. */
119
+ bool isLikelyTdnn = false;
120
+ };
121
+
122
+ /** Which model types are possible given paths and hints (has* flags). */
123
+ struct SttCapabilities {
124
+ bool hasTransducer = false;
125
+ bool hasWhisper = false;
126
+ bool hasMoonshine = false;
127
+ bool hasMoonshineV2 = false;
128
+ bool hasParaformer = false;
129
+ bool hasFunAsrNano = false;
130
+ bool hasDolphin = false;
131
+ bool hasFireRedAsr = false;
132
+ /** True when dir name suggests Fire Red but only a single CTC/paraformer model (no encoder/decoder). Use zipformer_ctc. */
133
+ bool hasFireRedCtc = false;
134
+ bool hasCanary = false;
135
+ bool hasOmnilingual = false;
136
+ bool hasMedAsr = false;
137
+ bool hasTeleSpeechCtc = false;
138
+ bool hasToneCtc = false;
139
+ };
140
+
74
141
  struct TtsModelPaths {
75
142
  std::string ttsModel;
76
143
  std::string tokens;
@@ -92,6 +159,8 @@ struct TtsModelPaths {
92
159
  struct SttDetectResult {
93
160
  bool ok = false;
94
161
  std::string error;
162
+ /** True when detection failed because the model is for unsupported hardware (RK35xx, Ascend, CANN, etc.). */
163
+ bool isHardwareSpecificUnsupported = false;
95
164
  std::vector<DetectedModel> detectedModels;
96
165
  SttModelKind selectedKind = SttModelKind::kUnknown;
97
166
  bool tokensRequired = true;
@@ -104,6 +173,8 @@ struct TtsDetectResult {
104
173
  std::vector<DetectedModel> detectedModels;
105
174
  TtsModelKind selectedKind = TtsModelKind::kUnknown;
106
175
  TtsModelPaths paths;
176
+ /** Language ids from detected lexicon files (e.g. "default", "us-en", "zh") for multi-lang Kokoro/Kitten. Empty when not applicable. */
177
+ std::vector<std::string> lexiconLanguageCandidates;
107
178
  };
108
179
 
109
180
  SttDetectResult DetectSttModel(
@@ -113,11 +184,32 @@ SttDetectResult DetectSttModel(
113
184
  bool debug = false
114
185
  );
115
186
 
187
+ /** Test-only: Like DetectSttModel but takes a pre-built file list; no filesystem access.
188
+ * Only used by the host-side C++ test suite (test/cpp/model_detect_test.cpp). Not used in
189
+ * production (Android/iOS use DetectSttModel). Does not validate modelDir existence or
190
+ * call FileExists on tokens/bpeVocab. */
191
+ SttDetectResult DetectSttModelFromFileList(
192
+ const std::vector<model_detect::FileEntry>& files,
193
+ const std::string& modelDir,
194
+ const std::optional<bool>& preferInt8 = std::nullopt,
195
+ const std::optional<std::string>& modelType = std::nullopt
196
+ );
197
+
116
198
  TtsDetectResult DetectTtsModel(
117
199
  const std::string& modelDir,
118
200
  const std::string& modelType
119
201
  );
120
202
 
203
+ /** Test-only: Like DetectTtsModel but takes a pre-built file list; no filesystem access.
204
+ * Only used by the host-side C++ test suite (test/cpp/model_detect_test.cpp). Not used in
205
+ * production (Android/iOS use DetectTtsModel). Does not validate modelDir existence or
206
+ * call FileExists / IsDirectory. */
207
+ TtsDetectResult DetectTtsModelFromFileList(
208
+ const std::vector<model_detect::FileEntry>& files,
209
+ const std::string& modelDir,
210
+ const std::string& modelType = "auto"
211
+ );
212
+
121
213
  } // namespace sherpaonnx
122
214
 
123
215
  #endif // SHERPA_ONNX_MODEL_DETECT_H
@@ -25,6 +25,7 @@ const char* SttModelKindToString(SttModelKind k) {
25
25
  case SttModelKind::kFunAsrNano: return "funasr_nano";
26
26
  case SttModelKind::kFireRedAsr: return "fire_red_asr";
27
27
  case SttModelKind::kMoonshine: return "moonshine";
28
+ case SttModelKind::kMoonshineV2: return "moonshine_v2";
28
29
  case SttModelKind::kDolphin: return "dolphin";
29
30
  case SttModelKind::kCanary: return "canary";
30
31
  case SttModelKind::kOmnilingual: return "omnilingual";
@@ -52,6 +53,7 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
52
53
 
53
54
  PutBoolean(env, map, mapPut, "success", result.ok);
54
55
  PutString(env, map, mapPut, "error", result.error);
56
+ PutBoolean(env, map, mapPut, "isHardwareSpecificUnsupported", result.isHardwareSpecificUnsupported);
55
57
  PutString(env, map, mapPut, "modelType", SttModelKindToString(result.selectedKind));
56
58
 
57
59
  jobject detectedList = BuildDetectedModelsList(env, result.detectedModels);
@@ -81,6 +83,7 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
81
83
  PutString(env, pathsMap, mapPut, "moonshineEncoder", result.paths.moonshineEncoder);
82
84
  PutString(env, pathsMap, mapPut, "moonshineUncachedDecoder", result.paths.moonshineUncachedDecoder);
83
85
  PutString(env, pathsMap, mapPut, "moonshineCachedDecoder", result.paths.moonshineCachedDecoder);
86
+ PutString(env, pathsMap, mapPut, "moonshineMergedDecoder", result.paths.moonshineMergedDecoder);
84
87
  PutString(env, pathsMap, mapPut, "dolphinModel", result.paths.dolphinModel);
85
88
  PutString(env, pathsMap, mapPut, "omnilingualModel", result.paths.omnilingualModel);
86
89
  PutString(env, pathsMap, mapPut, "medasrModel", result.paths.medasrModel);
@@ -45,10 +45,20 @@ jobject TtsDetectResultToJava(JNIEnv* env, const TtsDetectResult& result) {
45
45
 
46
46
  jobject detectedList = BuildDetectedModelsList(env, result.detectedModels);
47
47
  if (detectedList) {
48
- env->CallObjectMethod(map, mapPut, env->NewStringUTF("detectedModels"), detectedList);
48
+ jstring keyDetected = env->NewStringUTF("detectedModels");
49
+ env->CallObjectMethod(map, mapPut, keyDetected, detectedList);
50
+ env->DeleteLocalRef(keyDetected);
49
51
  env->DeleteLocalRef(detectedList);
50
52
  }
51
53
 
54
+ jobject langCandidatesList = BuildStringList(env, result.lexiconLanguageCandidates);
55
+ if (langCandidatesList) {
56
+ jstring keyLangCandidates = env->NewStringUTF("lexiconLanguageCandidates");
57
+ env->CallObjectMethod(map, mapPut, keyLangCandidates, langCandidatesList);
58
+ env->DeleteLocalRef(keyLangCandidates);
59
+ env->DeleteLocalRef(langCandidatesList);
60
+ }
61
+
52
62
  jclass hashMapClass = env->FindClass("java/util/HashMap");
53
63
  if (hashMapClass) {
54
64
  jobject pathsMap = env->NewObject(hashMapClass, mapInit);
@@ -68,7 +78,9 @@ jobject TtsDetectResultToJava(JNIEnv* env, const TtsDetectResult& result) {
68
78
  PutString(env, pathsMap, mapPut, "textConditioner", result.paths.textConditioner);
69
79
  PutString(env, pathsMap, mapPut, "vocabJson", result.paths.vocabJson);
70
80
  PutString(env, pathsMap, mapPut, "tokenScoresJson", result.paths.tokenScoresJson);
71
- env->CallObjectMethod(map, mapPut, env->NewStringUTF("paths"), pathsMap);
81
+ jstring keyPaths = env->NewStringUTF("paths");
82
+ env->CallObjectMethod(map, mapPut, keyPaths, pathsMap);
83
+ env->DeleteLocalRef(keyPaths);
72
84
  env->DeleteLocalRef(pathsMap);
73
85
  }
74
86
  }