react-native-sherpa-onnx 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +28 -15
  2. package/SherpaOnnx.podspec +13 -5
  3. package/android/prebuilt-download.gradle +18 -5
  4. package/android/prebuilt-versions.gradle +8 -4
  5. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +43 -142
  6. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +12 -4
  7. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +694 -307
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +194 -99
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +90 -0
  10. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
  11. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +70 -0
  12. package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
  13. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +39 -19
  14. package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
  15. package/ios/SherpaOnnx+STT.mm +2 -0
  16. package/ios/SherpaOnnx.mm +1 -1
  17. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +9 -3
  18. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +38 -54
  19. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +620 -267
  20. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +131 -28
  21. package/ios/model_detect/sherpa-onnx-model-detect.h +70 -0
  22. package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
  23. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  24. package/lib/module/audio/index.js +52 -0
  25. package/lib/module/audio/index.js.map +1 -1
  26. package/lib/module/stt/streaming.js +6 -3
  27. package/lib/module/stt/streaming.js.map +1 -1
  28. package/lib/typescript/src/NativeSherpaOnnx.d.ts +16 -2
  29. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  30. package/lib/typescript/src/audio/index.d.ts +17 -0
  31. package/lib/typescript/src/audio/index.d.ts.map +1 -1
  32. package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
  33. package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
  34. package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
  35. package/package.json +6 -1
  36. package/scripts/check-model-csvs.sh +72 -0
  37. package/scripts/setup-ios-framework.sh +48 -48
  38. package/src/NativeSherpaOnnx.ts +18 -2
  39. package/src/audio/index.ts +81 -0
  40. package/src/stt/streaming.ts +10 -5
  41. package/src/stt/streamingTypes.ts +1 -1
  42. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
  43. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
@@ -1,16 +1,49 @@
1
1
  /**
2
2
  * sherpa-onnx-model-detect-tts.cpp
3
3
  *
4
- * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Supports Vits,
5
- * Piper, Kokoro, Zipvoice, Pocket, etc. Used by nativeDetectTtsModel (module-jni).
4
+ * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
5
+ * nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
6
+ *
7
+ * --- Detection pipeline (overview) ---
8
+ *
9
+ * 1. Gather files in modelDir (recursive), then map file names to logical paths (ttsModel,
10
+ * acousticModel, vocoder, encoder, decoder, lmFlow, lmMain, textConditioner, tokens, lexicon,
11
+ * dataDir, voices, vocabJson, tokenScoresJson). Path hints from directory name (isLikelyVits,
12
+ * isLikelyKitten, isLikelyKokoro).
13
+ *
14
+ * 2. Capabilities (hasVits, hasMatcha, hasPocket, hasZipvoice, hasVoicesFile, hasDataDir): which
15
+ * model types are *possible* given the paths. Multiple can be true (e.g. voices.bin can satisfy
16
+ * both Kokoro and Kitten).
17
+ *
18
+ * 3. detectedModels (for UI "Select model type"): built from capabilities only. Every kind with
19
+ * the corresponding has* == true is added (with existing rules: zipvoice only if !hasMatcha,
20
+ * vits when hasVits and no voices or ambiguous folder name).
21
+ *
22
+ * 4. selectedKind: from ResolveTtsKind(). If modelType is explicit, use it if capabilities allow.
23
+ * If modelType == "auto": Priority 1 = folder name (GetKindsFromDirNameTts: tokens like "vits",
24
+ * "matcha", "kokoro" in dir name → candidate kinds). Priority 2 = among those candidates, pick
25
+ * the first that CapabilitySupportsTtsKind(). Fallback = file-only order (matcha → pocket →
26
+ * zipvoice → kokoro/kitten → vits).
27
+ *
28
+ * 5. paths: all gathered paths are written into result.paths; the selected kind determines which
29
+ * engine is used at runtime.
30
+ *
31
+ * Result to caller: ok, error, detectedModels (list), selectedKind (single), paths.
6
32
  */
7
33
  #include "sherpa-onnx-model-detect.h"
8
34
  #include "sherpa-onnx-model-detect-helper.h"
35
+ #include <algorithm>
36
+ #include <string>
37
+ #include <vector>
38
+ #ifdef __ANDROID__
9
39
  #include <android/log.h>
10
-
11
40
  #define LOG_TAG "TtsModelDetect"
12
41
  #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
13
42
  #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
43
+ #else
44
+ #define LOGI(...) ((void)0)
45
+ #define LOGE(...) ((void)0)
46
+ #endif
14
47
 
15
48
  namespace sherpaonnx {
16
49
  namespace {
@@ -25,94 +58,117 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
25
58
  return TtsModelKind::kUnknown;
26
59
  }
27
60
 
28
- } // namespace
61
+ /** Returns true if the given kind is supported by the current paths and hints (required files present). */
62
+ static bool CapabilitySupportsTtsKind(
63
+ TtsModelKind kind,
64
+ bool hasVits,
65
+ bool hasMatcha,
66
+ bool hasPocket,
67
+ bool hasZipvoice,
68
+ bool hasVoicesFile,
69
+ bool hasDataDir
70
+ ) {
71
+ switch (kind) {
72
+ case TtsModelKind::kVits:
73
+ return hasVits && hasDataDir;
74
+ case TtsModelKind::kMatcha:
75
+ return hasMatcha && hasDataDir;
76
+ case TtsModelKind::kKokoro:
77
+ case TtsModelKind::kKitten:
78
+ return hasVoicesFile && hasDataDir;
79
+ case TtsModelKind::kPocket:
80
+ return hasPocket;
81
+ case TtsModelKind::kZipvoice:
82
+ return hasZipvoice;
83
+ default:
84
+ return false;
85
+ }
86
+ }
29
87
 
30
- TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
88
+ /**
89
+ * Priority 1: Collect candidate TTS kinds from the model directory name (last path component).
90
+ * Tokens like "vits", "matcha", "kokoro" are matched case-insensitively. Returns candidates in a
91
+ * fixed priority order for file-based disambiguation when multiple names match.
92
+ */
93
+ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& modelDir) {
31
94
  using namespace model_detect;
95
+ size_t pos = modelDir.find_last_of("/\\");
96
+ std::string base = (pos == std::string::npos) ? modelDir : modelDir.substr(pos + 1);
97
+ std::string lower = ToLower(base);
32
98
 
33
- TtsDetectResult result;
99
+ std::vector<TtsModelKind> out;
100
+ auto add = [&out](TtsModelKind k) {
101
+ if (std::find(out.begin(), out.end(), k) == out.end())
102
+ out.push_back(k);
103
+ };
34
104
 
35
- LOGI("DetectTtsModel: modelDir=%s, modelType=%s", modelDir.c_str(), modelType.c_str());
105
+ if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
106
+ if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
107
+ if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
108
+ if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
109
+ if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
110
+ if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
36
111
 
37
- if (modelDir.empty()) {
38
- result.error = "TTS: Model directory is empty";
39
- LOGE("%s", result.error.c_str());
40
- return result;
41
- }
42
-
43
- if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
44
- result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
45
- LOGE("%s", result.error.c_str());
46
- return result;
47
- }
112
+ return out;
113
+ }
48
114
 
49
- const auto files = ListFilesRecursive(modelDir, 4);
50
- LOGI("DetectTtsModel: Found %zu files in %s", files.size(), modelDir.c_str());
51
- for (const auto& f : files) {
52
- LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
53
- }
115
+ /** Shared detection logic: runs on a pre-built file list. No filesystem access, no logging. */
116
+ static TtsDetectResult DetectTtsModelFromFiles(
117
+ const std::vector<model_detect::FileEntry>& files,
118
+ const std::string& modelDir,
119
+ const std::string& modelType
120
+ ) {
121
+ using namespace model_detect;
54
122
 
55
- std::string tokensFile = FindFileByName(modelDir, "tokens.txt", 2);
56
- std::string lexiconFile = FindFileByName(modelDir, "lexicon.txt", 2);
57
- std::string dataDirPath = FindDirectoryByName(modelDir, "espeak-ng-data", 2);
58
- std::string voicesFile = FindFileByName(modelDir, "voices.bin", 2);
123
+ TtsDetectResult result;
59
124
 
60
- LOGI("DetectTtsModel: tokens=%s, lexicon=%s, dataDir=%s, voices=%s",
61
- tokensFile.c_str(), lexiconFile.c_str(), dataDirPath.c_str(), voicesFile.c_str());
125
+ std::string tokensFile = FindFileByName(files, "tokens.txt");
126
+ std::string lexiconFile = FindFileByName(files, "lexicon.txt");
127
+ std::string dataDirPath;
128
+ {
129
+ const std::string prefix = modelDir + "/espeak-ng-data/";
130
+ for (const auto& entry : files) {
131
+ if (entry.path.size() > prefix.size() && entry.path.compare(0, prefix.size(), prefix) == 0) {
132
+ dataDirPath = modelDir + "/espeak-ng-data";
133
+ break;
134
+ }
135
+ }
136
+ }
137
+ std::string voicesFile = FindFileByName(files, "voices.bin");
62
138
 
63
139
  std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
64
- // Note: matches either a "vocoder" or "vocos" ONNX file; both are stored in this field.
65
140
  std::string vocoder = FindOnnxByAnyToken(files, {"vocoder", "vocos"}, std::nullopt);
66
141
  std::string encoder = FindOnnxByAnyToken(files, {"encoder"}, std::nullopt);
67
142
  std::string decoder = FindOnnxByAnyToken(files, {"decoder"}, std::nullopt);
68
143
  std::string lmFlow = FindOnnxByAnyToken(files, {"lm_flow", "lm-flow"}, std::nullopt);
69
144
  std::string lmMain = FindOnnxByAnyToken(files, {"lm_main", "lm-main"}, std::nullopt);
70
145
  std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
71
- std::string vocabJsonFile = FindFileByName(modelDir, "vocab.json", 2);
72
- std::string tokenScoresJsonFile = FindFileByName(modelDir, "token_scores.json", 2);
73
-
74
- LOGI("DetectTtsModel: acousticModel=%s, vocoder=%s, encoder=%s, decoder=%s",
75
- acousticModel.c_str(), vocoder.c_str(), encoder.c_str(), decoder.c_str());
76
- LOGI("DetectTtsModel: lmFlow=%s, lmMain=%s, textConditioner=%s, vocabJson=%s, tokenScoresJson=%s",
77
- lmFlow.c_str(), lmMain.c_str(), textConditioner.c_str(), vocabJsonFile.c_str(), tokenScoresJsonFile.c_str());
146
+ std::string vocabJsonFile = FindFileByName(files, "vocab.json");
147
+ std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
78
148
 
79
149
  std::vector<std::string> modelExcludes = {
80
- "acoustic",
81
- "vocoder",
82
- "encoder",
83
- "decoder",
84
- "joiner"
150
+ "acoustic", "vocoder", "encoder", "decoder", "joiner"
85
151
  };
86
-
87
152
  std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
88
153
  if (ttsModel.empty()) {
89
154
  ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
90
155
  }
91
- LOGI("DetectTtsModel: ttsModel=%s", ttsModel.c_str());
92
156
 
93
157
  bool hasVits = !ttsModel.empty();
94
158
  bool hasMatcha = !acousticModel.empty() && !vocoder.empty();
95
- bool hasVoicesFile = !voicesFile.empty() && FileExists(voicesFile);
96
- // Zipvoice requires encoder + decoder + vocoder (full model). Distill variants (no vocoder) are not supported by the native layer.
159
+ bool hasVoicesFile = !voicesFile.empty();
97
160
  bool hasZipvoice = !encoder.empty() && !decoder.empty() && !vocoder.empty();
98
161
  bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
99
- !textConditioner.empty() && !vocabJsonFile.empty() && FileExists(vocabJsonFile) &&
100
- !tokenScoresJsonFile.empty() && FileExists(tokenScoresJsonFile);
101
- bool hasDataDir = !dataDirPath.empty() && IsDirectory(dataDirPath);
162
+ !textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
163
+ bool hasDataDir = !dataDirPath.empty();
102
164
 
103
165
  std::string modelDirLower = ToLower(modelDir);
104
166
  bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
105
167
  bool isLikelyKokoro = modelDirLower.find("kokoro") != std::string::npos;
106
168
 
107
- if (hasMatcha) {
108
- result.detectedModels.push_back({"matcha", modelDir});
109
- }
110
- if (hasPocket) {
111
- result.detectedModels.push_back({"pocket", modelDir});
112
- }
113
- if (hasZipvoice && !hasMatcha) {
114
- result.detectedModels.push_back({"zipvoice", modelDir});
115
- }
169
+ if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
170
+ if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
171
+ if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
116
172
  if (hasVoicesFile) {
117
173
  if (isLikelyKitten && !isLikelyKokoro) {
118
174
  result.detectedModels.push_back({"kitten", modelDir});
@@ -123,23 +179,11 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
123
179
  result.detectedModels.push_back({"kitten", modelDir});
124
180
  }
125
181
  }
126
-
127
182
  if (hasVits) {
128
183
  bool isLikelyVits = modelDirLower.find("vits") != std::string::npos;
129
184
  bool voicesAmbiguous = !isLikelyKitten && !isLikelyKokoro;
130
-
131
- bool addVits = false;
132
- if (!hasVoicesFile) {
133
- addVits = true;
134
- } else {
135
- if (isLikelyVits || voicesAmbiguous) {
136
- addVits = true;
137
- }
138
- }
139
-
140
- if (addVits) {
141
- result.detectedModels.push_back({"vits", modelDir});
142
- }
185
+ bool addVits = !hasVoicesFile || isLikelyVits || voicesAmbiguous;
186
+ if (addVits) result.detectedModels.push_back({"vits", modelDir});
143
187
  }
144
188
 
145
189
  TtsModelKind selected = TtsModelKind::kUnknown;
@@ -150,22 +194,25 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
150
194
  return result;
151
195
  }
152
196
  } else {
153
- if (hasMatcha) {
154
- selected = TtsModelKind::kMatcha;
155
- } else if (hasPocket) {
156
- selected = TtsModelKind::kPocket;
157
- } else if (hasZipvoice) {
158
- selected = TtsModelKind::kZipvoice;
159
- } else if (hasVoicesFile) {
160
- if (isLikelyKitten && !isLikelyKokoro) {
161
- selected = TtsModelKind::kKitten;
162
- } else if (isLikelyKokoro && !isLikelyKitten) {
163
- selected = TtsModelKind::kKokoro;
164
- } else {
165
- selected = TtsModelKind::kKokoro;
197
+ std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
198
+ if (!nameCandidates.empty()) {
199
+ for (TtsModelKind k : nameCandidates) {
200
+ if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
201
+ hasVoicesFile, hasDataDir)) {
202
+ selected = k;
203
+ break;
204
+ }
166
205
  }
167
- } else if (hasVits) {
168
- selected = TtsModelKind::kVits;
206
+ }
207
+ if (selected == TtsModelKind::kUnknown) {
208
+ if (hasMatcha) selected = TtsModelKind::kMatcha;
209
+ else if (hasPocket) selected = TtsModelKind::kPocket;
210
+ else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
211
+ else if (hasVoicesFile) {
212
+ if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
213
+ else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
214
+ else selected = TtsModelKind::kKokoro;
215
+ } else if (hasVits) selected = TtsModelKind::kVits;
169
216
  }
170
217
  }
171
218
 
@@ -173,7 +220,6 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
173
220
  result.error = "TTS: No compatible model type detected in " + modelDir;
174
221
  return result;
175
222
  }
176
-
177
223
  if (selected == TtsModelKind::kVits && !hasVits) {
178
224
  result.error = "TTS: VITS model requested but model file not found in " + modelDir;
179
225
  return result;
@@ -182,7 +228,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
182
228
  result.error = "TTS: Matcha model requested but required files not found in " + modelDir;
183
229
  return result;
184
230
  }
185
- if ((selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten) && (!hasVits || !hasVoicesFile)) {
231
+ if ((selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten) && (!hasVoicesFile || !hasDataDir)) {
186
232
  result.error = "TTS: Kokoro/Kitten model requested but required files not found in " + modelDir;
187
233
  return result;
188
234
  }
@@ -196,8 +242,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
196
242
  }
197
243
  if ((selected == TtsModelKind::kVits || selected == TtsModelKind::kMatcha ||
198
244
  selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten ||
199
- selected == TtsModelKind::kZipvoice) &&
200
- !hasDataDir) {
245
+ selected == TtsModelKind::kZipvoice) && !hasDataDir) {
201
246
  result.error = "TTS: espeak-ng-data not found in " + modelDir +
202
247
  ". Copy espeak-ng-data into the model directory.";
203
248
  return result;
@@ -206,7 +251,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
206
251
  result.selectedKind = selected;
207
252
  result.paths.ttsModel = ttsModel;
208
253
  result.paths.tokens = tokensFile;
209
- result.paths.lexicon = !lexiconFile.empty() && FileExists(lexiconFile) ? lexiconFile : "";
254
+ result.paths.lexicon = !lexiconFile.empty() ? lexiconFile : "";
210
255
  result.paths.dataDir = dataDirPath;
211
256
  result.paths.voices = voicesFile;
212
257
  result.paths.acousticModel = acousticModel;
@@ -219,20 +264,70 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
219
264
  result.paths.vocabJson = vocabJsonFile;
220
265
  result.paths.tokenScoresJson = tokenScoresJsonFile;
221
266
 
222
- LOGI("DetectTtsModel: selected kind=%d, ttsModel=%s",
223
- static_cast<int>(selected), ttsModel.c_str());
224
- LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
225
- result.paths.tokens.c_str(), result.paths.dataDir.c_str());
226
-
227
- if (selected != TtsModelKind::kPocket && (tokensFile.empty() || !FileExists(tokensFile))) {
267
+ if (selected != TtsModelKind::kPocket && tokensFile.empty()) {
228
268
  result.error = "TTS: tokens.txt not found in " + modelDir;
229
- LOGE("%s", result.error.c_str());
230
269
  return result;
231
270
  }
232
271
 
233
272
  result.ok = true;
273
+ return result;
274
+ }
275
+
276
+ } // namespace
277
+
278
+ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
279
+ using namespace model_detect;
280
+
281
+ TtsDetectResult result;
282
+
283
+ LOGI("DetectTtsModel: modelDir=%s, modelType=%s", modelDir.c_str(), modelType.c_str());
284
+
285
+ if (modelDir.empty()) {
286
+ result.error = "TTS: Model directory is empty";
287
+ LOGE("%s", result.error.c_str());
288
+ return result;
289
+ }
290
+
291
+ if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
292
+ result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
293
+ LOGE("%s", result.error.c_str());
294
+ return result;
295
+ }
296
+
297
+ const auto files = ListFilesRecursive(modelDir, 4);
298
+ LOGI("DetectTtsModel: Found %zu files in %s", files.size(), modelDir.c_str());
299
+ for (const auto& f : files) {
300
+ LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
301
+ }
302
+
303
+ result = DetectTtsModelFromFiles(files, modelDir, modelType);
304
+ if (!result.ok) {
305
+ if (!result.error.empty()) LOGE("%s", result.error.c_str());
306
+ return result;
307
+ }
308
+ LOGI("DetectTtsModel: tokens=%s, lexicon=%s, dataDir=%s, voices=%s",
309
+ result.paths.tokens.c_str(), result.paths.lexicon.c_str(),
310
+ result.paths.dataDir.c_str(), result.paths.voices.c_str());
311
+ LOGI("DetectTtsModel: selected kind=%d, ttsModel=%s",
312
+ static_cast<int>(result.selectedKind), result.paths.ttsModel.c_str());
313
+ LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
314
+ result.paths.tokens.c_str(), result.paths.dataDir.c_str());
234
315
  LOGI("DetectTtsModel: detection OK for %s", modelDir.c_str());
235
316
  return result;
236
317
  }
237
318
 
319
+ // Test-only: used by host-side model_detect_test; not used in production (Android/iOS use DetectTtsModel).
320
+ TtsDetectResult DetectTtsModelFromFileList(
321
+ const std::vector<model_detect::FileEntry>& files,
322
+ const std::string& modelDir,
323
+ const std::string& modelType
324
+ ) {
325
+ TtsDetectResult result;
326
+ if (modelDir.empty()) {
327
+ result.error = "TTS: Model directory is empty";
328
+ return result;
329
+ }
330
+ return DetectTtsModelFromFiles(files, modelDir, modelType);
331
+ }
332
+
238
333
  } // namespace sherpaonnx
@@ -2,6 +2,7 @@
2
2
  #define SHERPA_ONNX_MODEL_DETECT_H
3
3
 
4
4
  #include "sherpa-onnx-common.h"
5
+ #include "sherpa-onnx-model-detect-helper.h"
5
6
  #include <optional>
6
7
  #include <string>
7
8
  #include <vector>
@@ -21,6 +22,7 @@ enum class SttModelKind {
21
22
  kFunAsrNano,
22
23
  kFireRedAsr,
23
24
  kMoonshine,
25
+ kMoonshineV2,
24
26
  kDolphin,
25
27
  kCanary,
26
28
  kOmnilingual,
@@ -59,6 +61,8 @@ struct SttModelPaths {
59
61
  std::string moonshineEncoder;
60
62
  std::string moonshineUncachedDecoder;
61
63
  std::string moonshineCachedDecoder;
64
+ /** Moonshine v2: encoder + mergedDecoder (reuse moonshineEncoder for encoder path). */
65
+ std::string moonshineMergedDecoder;
62
66
  // Dolphin, Omnilingual, MedAsr, TeleSpeech (single model each)
63
67
  std::string dolphinModel;
64
68
  std::string omnilingualModel;
@@ -71,6 +75,69 @@ struct SttModelPaths {
71
75
  std::string canaryDecoder;
72
76
  };
73
77
 
78
+ /** All candidate paths gathered before model kind selection (used by STT detection steps). */
79
+ struct SttCandidatePaths {
80
+ std::string encoder;
81
+ std::string decoder;
82
+ std::string joiner;
83
+ std::string paraformerModel;
84
+ std::string ctcModel;
85
+ std::string tokens;
86
+ std::string bpeVocab;
87
+ std::string funasrEncoderAdaptor;
88
+ std::string funasrLLM;
89
+ std::string funasrEmbedding;
90
+ std::string funasrTokenizerDir;
91
+ std::string moonshinePreprocessor;
92
+ std::string moonshineEncoder;
93
+ std::string moonshineUncachedDecoder;
94
+ std::string moonshineCachedDecoder;
95
+ std::string moonshineMergedDecoder;
96
+ std::string encoderForV2;
97
+ };
98
+
99
+ /** Path hints derived from model directory name (isLikely* flags). */
100
+ struct SttPathHints {
101
+ bool isLikelyNemo = false;
102
+ bool isLikelyTdt = false;
103
+ bool isLikelyWenetCtc = false;
104
+ bool isLikelySenseVoice = false;
105
+ bool isLikelyFunAsrNano = false;
106
+ bool isLikelyZipformer = false;
107
+ bool isLikelyMoonshine = false;
108
+ bool isLikelyDolphin = false;
109
+ bool isLikelyFireRedAsr = false;
110
+ bool isLikelyCanary = false;
111
+ bool isLikelyOmnilingual = false;
112
+ bool isLikelyMedAsr = false;
113
+ bool isLikelyTeleSpeech = false;
114
+ bool isLikelyToneCtc = false;
115
+ bool isLikelyParaformer = false;
116
+ /** VAD (silero, ten-vad, etc.): not yet supported; when true, detection returns unsupported. */
117
+ bool isLikelyVad = false;
118
+ /** TDNN (keyword/yesno): not yet supported; when true, detection returns unsupported. */
119
+ bool isLikelyTdnn = false;
120
+ };
121
+
122
+ /** Which model types are possible given paths and hints (has* flags). */
123
+ struct SttCapabilities {
124
+ bool hasTransducer = false;
125
+ bool hasWhisper = false;
126
+ bool hasMoonshine = false;
127
+ bool hasMoonshineV2 = false;
128
+ bool hasParaformer = false;
129
+ bool hasFunAsrNano = false;
130
+ bool hasDolphin = false;
131
+ bool hasFireRedAsr = false;
132
+ /** True when dir name suggests Fire Red but only a single CTC/paraformer model (no encoder/decoder). Use zipformer_ctc. */
133
+ bool hasFireRedCtc = false;
134
+ bool hasCanary = false;
135
+ bool hasOmnilingual = false;
136
+ bool hasMedAsr = false;
137
+ bool hasTeleSpeechCtc = false;
138
+ bool hasToneCtc = false;
139
+ };
140
+
74
141
  struct TtsModelPaths {
75
142
  std::string ttsModel;
76
143
  std::string tokens;
@@ -92,6 +159,8 @@ struct TtsModelPaths {
92
159
  struct SttDetectResult {
93
160
  bool ok = false;
94
161
  std::string error;
162
+ /** True when detection failed because the model is for unsupported hardware (RK35xx, Ascend, CANN, etc.). */
163
+ bool isHardwareSpecificUnsupported = false;
95
164
  std::vector<DetectedModel> detectedModels;
96
165
  SttModelKind selectedKind = SttModelKind::kUnknown;
97
166
  bool tokensRequired = true;
@@ -113,11 +182,32 @@ SttDetectResult DetectSttModel(
113
182
  bool debug = false
114
183
  );
115
184
 
185
+ /** Test-only: Like DetectSttModel but takes a pre-built file list; no filesystem access.
186
+ * Only used by the host-side C++ test suite (test/cpp/model_detect_test.cpp). Not used in
187
+ * production (Android/iOS use DetectSttModel). Does not validate modelDir existence or
188
+ * call FileExists on tokens/bpeVocab. */
189
+ SttDetectResult DetectSttModelFromFileList(
190
+ const std::vector<model_detect::FileEntry>& files,
191
+ const std::string& modelDir,
192
+ const std::optional<bool>& preferInt8 = std::nullopt,
193
+ const std::optional<std::string>& modelType = std::nullopt
194
+ );
195
+
116
196
  TtsDetectResult DetectTtsModel(
117
197
  const std::string& modelDir,
118
198
  const std::string& modelType
119
199
  );
120
200
 
201
+ /** Test-only: Like DetectTtsModel but takes a pre-built file list; no filesystem access.
202
+ * Only used by the host-side C++ test suite (test/cpp/model_detect_test.cpp). Not used in
203
+ * production (Android/iOS use DetectTtsModel). Does not validate modelDir existence or
204
+ * call FileExists / IsDirectory. */
205
+ TtsDetectResult DetectTtsModelFromFileList(
206
+ const std::vector<model_detect::FileEntry>& files,
207
+ const std::string& modelDir,
208
+ const std::string& modelType = "auto"
209
+ );
210
+
121
211
  } // namespace sherpaonnx
122
212
 
123
213
  #endif // SHERPA_ONNX_MODEL_DETECT_H
@@ -25,6 +25,7 @@ const char* SttModelKindToString(SttModelKind k) {
25
25
  case SttModelKind::kFunAsrNano: return "funasr_nano";
26
26
  case SttModelKind::kFireRedAsr: return "fire_red_asr";
27
27
  case SttModelKind::kMoonshine: return "moonshine";
28
+ case SttModelKind::kMoonshineV2: return "moonshine_v2";
28
29
  case SttModelKind::kDolphin: return "dolphin";
29
30
  case SttModelKind::kCanary: return "canary";
30
31
  case SttModelKind::kOmnilingual: return "omnilingual";
@@ -52,6 +53,7 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
52
53
 
53
54
  PutBoolean(env, map, mapPut, "success", result.ok);
54
55
  PutString(env, map, mapPut, "error", result.error);
56
+ PutBoolean(env, map, mapPut, "isHardwareSpecificUnsupported", result.isHardwareSpecificUnsupported);
55
57
  PutString(env, map, mapPut, "modelType", SttModelKindToString(result.selectedKind));
56
58
 
57
59
  jobject detectedList = BuildDetectedModelsList(env, result.detectedModels);
@@ -81,6 +83,7 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
81
83
  PutString(env, pathsMap, mapPut, "moonshineEncoder", result.paths.moonshineEncoder);
82
84
  PutString(env, pathsMap, mapPut, "moonshineUncachedDecoder", result.paths.moonshineUncachedDecoder);
83
85
  PutString(env, pathsMap, mapPut, "moonshineCachedDecoder", result.paths.moonshineCachedDecoder);
86
+ PutString(env, pathsMap, mapPut, "moonshineMergedDecoder", result.paths.moonshineMergedDecoder);
84
87
  PutString(env, pathsMap, mapPut, "dolphinModel", result.paths.dolphinModel);
85
88
  PutString(env, pathsMap, mapPut, "omnilingualModel", result.paths.omnilingualModel);
86
89
  PutString(env, pathsMap, mapPut, "medasrModel", result.paths.medasrModel);
@@ -55,6 +55,7 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
55
55
  { instanceId, requestId, cancelled -> emitTtsStreamEnd(instanceId, requestId, cancelled) }
56
56
  )
57
57
  private val archiveHelper = SherpaOnnxArchiveHelper()
58
+ private var pcmCapture: SherpaOnnxPcmCapture? = null
58
59
 
59
60
  override fun getName(): String {
60
61
  return NAME
@@ -62,6 +63,8 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
62
63
 
63
64
  override fun onCatalystInstanceDestroy() {
64
65
  super.onCatalystInstanceDestroy()
66
+ pcmCapture?.stop()
67
+ pcmCapture = null
65
68
  onlineSttHelper.shutdown()
66
69
  ttsHelper.shutdown()
67
70
  }
@@ -319,12 +322,14 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
319
322
  return
320
323
  }
321
324
  val success = result["success"] as? Boolean ?: false
325
+ val isHardwareSpecificUnsupported = result["isHardwareSpecificUnsupported"] as? Boolean ?: false
322
326
  val detectedModels = result["detectedModels"] as? ArrayList<*>
323
327
  ?: arrayListOf<HashMap<String, String>>()
324
328
  val modelTypeStr = result["modelType"] as? String
325
329
 
326
330
  val resultMap = Arguments.createMap()
327
331
  resultMap.putBoolean("success", success)
332
+ resultMap.putBoolean("isHardwareSpecificUnsupported", isHardwareSpecificUnsupported)
328
333
  val modelsArray = Arguments.createArray()
329
334
  for (model in detectedModels) {
330
335
  val modelMap = model as? HashMap<*, *>
@@ -484,6 +489,71 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
484
489
  onlineSttHelper.processSttAudioChunk(streamId, samples, sampleRate.toInt(), promise)
485
490
  }
486
491
 
492
+ override fun startPcmLiveStream(options: ReadableMap, promise: Promise) {
493
+ try {
494
+ pcmCapture?.stop()
495
+ pcmCapture = null
496
+ val sampleRate = options.getDouble("sampleRate").toInt().takeIf { it > 0 } ?: 16000
497
+ val channelCount = if (options.hasKey("channelCount")) options.getDouble("channelCount").toInt().coerceIn(1, 2) else 1
498
+ val bufferSizeFrames = if (options.hasKey("bufferSizeFrames")) options.getDouble("bufferSizeFrames").toInt() else 0
499
+ var startError: String? = null
500
+ var started = false
501
+ val capture = SherpaOnnxPcmCapture(
502
+ targetSampleRate = sampleRate,
503
+ channelCount = channelCount,
504
+ bufferSizeFrames = bufferSizeFrames,
505
+ onChunk = { base64Pcm, sr -> emitPcmLiveStreamData(base64Pcm, sr) },
506
+ onError = { msg ->
507
+ if (!started) {
508
+ startError = msg
509
+ } else {
510
+ emitPcmLiveStreamError(msg)
511
+ }
512
+ },
513
+ logTag = NAME
514
+ )
515
+ pcmCapture = capture
516
+ capture.start()
517
+ started = true
518
+ val err = startError
519
+ if (err != null) {
520
+ promise.reject("PCM_LIVE_STREAM_ERROR", err)
521
+ } else {
522
+ promise.resolve(null)
523
+ }
524
+ } catch (e: Exception) {
525
+ android.util.Log.e(NAME, "startPcmLiveStream failed", e)
526
+ promise.reject("PCM_LIVE_STREAM_ERROR", e.message ?: "Failed to start PCM capture", e)
527
+ }
528
+ }
529
+
530
+ override fun stopPcmLiveStream(promise: Promise) {
531
+ try {
532
+ pcmCapture?.stop()
533
+ pcmCapture = null
534
+ promise.resolve(null)
535
+ } catch (e: Exception) {
536
+ promise.reject("PCM_LIVE_STREAM_ERROR", e.message ?: "Failed to stop PCM capture", e)
537
+ }
538
+ }
539
+
540
+ private fun emitPcmLiveStreamData(base64Pcm: String, sampleRate: Int) {
541
+ val eventEmitter = reactApplicationContext
542
+ .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
543
+ val payload = Arguments.createMap()
544
+ payload.putString("base64Pcm", base64Pcm)
545
+ payload.putInt("sampleRate", sampleRate)
546
+ eventEmitter.emit("pcmLiveStreamData", payload)
547
+ }
548
+
549
+ private fun emitPcmLiveStreamError(message: String) {
550
+ val eventEmitter = reactApplicationContext
551
+ .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
552
+ val payload = Arguments.createMap()
553
+ payload.putString("message", message)
554
+ eventEmitter.emit("pcmLiveStreamError", payload)
555
+ }
556
+
487
557
  // ==================== STT Methods ====================
488
558
 
489
559
  /**