react-native-sherpa-onnx 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -77
- package/SherpaOnnx.podspec +79 -45
- package/android/build.gradle +8 -2
- package/android/prebuilt-download.gradle +70 -16
- package/android/prebuilt-versions.gradle +14 -6
- package/android/src/main/cpp/CMakeLists.txt +2 -0
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +202 -328
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +22 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +2 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +96 -142
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +40 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +774 -316
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +208 -122
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +92 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +14 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +229 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.h +38 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +144 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.h +38 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +1 -1
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +157 -11
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +75 -24
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +52 -1
- package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
- package/ios/SherpaOnnx+STT.mm +2 -0
- package/ios/SherpaOnnx+TTS.mm +17 -0
- package/ios/SherpaOnnx.mm +27 -3
- package/ios/SherpaOnnxAudioConvert.h +28 -0
- package/ios/SherpaOnnxAudioConvert.mm +698 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +12 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +37 -3
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +80 -45
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +629 -267
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +148 -56
- package/ios/model_detect/sherpa-onnx-model-detect.h +72 -0
- package/ios/model_detect/sherpa-onnx-validate-stt.h +38 -0
- package/ios/model_detect/sherpa-onnx-validate-stt.mm +229 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.h +38 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +144 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +55 -1
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +14 -0
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/index.js +10 -0
- package/lib/module/index.js.map +1 -1
- package/lib/module/stt/streaming.js +6 -3
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/tts/index.js +13 -1
- package/lib/module/tts/index.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +32 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +20 -1
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +10 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +12 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/check-model-csvs.sh +72 -0
- package/scripts/setup-ios-framework.sh +272 -191
- package/src/NativeSherpaOnnx.ts +37 -3
- package/src/audio/index.ts +84 -1
- package/src/download/ModelDownloadManager.ts +19 -0
- package/src/index.tsx +15 -0
- package/src/stt/streaming.ts +10 -5
- package/src/stt/streamingTypes.ts +1 -1
- package/src/tts/index.ts +25 -1
- package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/ios/scripts/patch-libarchive-includes.sh +0 -61
- package/ios/scripts/setup-ios-libarchive.sh +0 -98
|
@@ -2,13 +2,42 @@
|
|
|
2
2
|
* sherpa-onnx-model-detect-tts.mm
|
|
3
3
|
*
|
|
4
4
|
* Purpose: Detects TTS (text-to-speech) model type and fills TtsModelPaths from a model directory.
|
|
5
|
-
* Supports Vits,
|
|
5
|
+
* Used by the TTS wrapper on iOS. Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
|
|
6
|
+
*
|
|
7
|
+
* --- Detection pipeline (overview) ---
|
|
8
|
+
*
|
|
9
|
+
* 1. Gather files in modelDir (recursive), then map file names to logical paths (ttsModel,
|
|
10
|
+
* acousticModel, vocoder, encoder, decoder, lmFlow, lmMain, textConditioner, tokens, lexicon,
|
|
11
|
+
* dataDir, voices, vocabJson, tokenScoresJson). Path hints from directory name (isLikelyVits,
|
|
12
|
+
* isLikelyKitten, isLikelyKokoro).
|
|
13
|
+
*
|
|
14
|
+
* 2. Capabilities (hasVits, hasMatcha, hasPocket, hasZipvoice, hasVoicesFile, hasDataDir): which
|
|
15
|
+
* model types are *possible* given the paths. Multiple can be true (e.g. voices.bin can satisfy
|
|
16
|
+
* both Kokoro and Kitten).
|
|
17
|
+
*
|
|
18
|
+
* 3. detectedModels (for UI "Select model type"): built from capabilities only. Every kind with
|
|
19
|
+
* the corresponding has* == true is added (with existing rules: zipvoice only if !hasMatcha,
|
|
20
|
+
* vits when hasVits and no voices or ambiguous folder name).
|
|
21
|
+
*
|
|
22
|
+
* 4. selectedKind: from ResolveTtsKind(). If modelType is explicit, use it if capabilities allow.
|
|
23
|
+
* If modelType == "auto": Priority 1 = folder name (GetKindsFromDirNameTts: tokens like "vits",
|
|
24
|
+
* "matcha", "kokoro" in dir name --> candidate kinds). Priority 2 = among those candidates, pick
|
|
25
|
+
* the first that CapabilitySupportsTtsKind(). Fallback = file-only order (matcha --> pocket -->
|
|
26
|
+
* zipvoice --> kokoro/kitten --> vits).
|
|
27
|
+
*
|
|
28
|
+
* 5. paths: all gathered paths are written into result.paths; the selected kind determines which
|
|
29
|
+
* engine is used at runtime.
|
|
30
|
+
*
|
|
31
|
+
* Result to caller: ok, error, detectedModels (list), selectedKind (single), paths.
|
|
6
32
|
*/
|
|
7
33
|
|
|
8
34
|
#include "sherpa-onnx-model-detect.h"
|
|
9
35
|
#include "sherpa-onnx-model-detect-helper.h"
|
|
36
|
+
#include "sherpa-onnx-validate-tts.h"
|
|
10
37
|
|
|
38
|
+
#include <algorithm>
|
|
11
39
|
#include <string>
|
|
40
|
+
#include <vector>
|
|
12
41
|
|
|
13
42
|
namespace sherpaonnx {
|
|
14
43
|
namespace {
|
|
@@ -25,6 +54,61 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
|
|
|
25
54
|
return TtsModelKind::kUnknown;
|
|
26
55
|
}
|
|
27
56
|
|
|
57
|
+
/** Returns true if the given kind is supported by the current paths and hints (required files present).
|
|
58
|
+
* data_dir (espeak-ng-data) is required only for Kitten and Kokoro (sherpa-onnx config Validate());
|
|
59
|
+
* VITS, Matcha, Zipvoice use it optionally; Pocket does not use it. */
|
|
60
|
+
static bool CapabilitySupportsTtsKind(
|
|
61
|
+
TtsModelKind kind,
|
|
62
|
+
bool hasVits,
|
|
63
|
+
bool hasMatcha,
|
|
64
|
+
bool hasPocket,
|
|
65
|
+
bool hasZipvoice,
|
|
66
|
+
bool hasVoicesFile,
|
|
67
|
+
bool hasDataDir
|
|
68
|
+
) {
|
|
69
|
+
switch (kind) {
|
|
70
|
+
case TtsModelKind::kVits:
|
|
71
|
+
return hasVits;
|
|
72
|
+
case TtsModelKind::kMatcha:
|
|
73
|
+
return hasMatcha;
|
|
74
|
+
case TtsModelKind::kKokoro:
|
|
75
|
+
case TtsModelKind::kKitten:
|
|
76
|
+
return hasVoicesFile && hasDataDir;
|
|
77
|
+
case TtsModelKind::kPocket:
|
|
78
|
+
return hasPocket;
|
|
79
|
+
case TtsModelKind::kZipvoice:
|
|
80
|
+
return hasZipvoice;
|
|
81
|
+
default:
|
|
82
|
+
return false;
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Priority 1: Collect candidate TTS kinds from the model directory name (last path component).
|
|
88
|
+
* Tokens like "vits", "matcha", "kokoro" are matched case-insensitively. Returns candidates in a
|
|
89
|
+
* fixed priority order for file-based disambiguation when multiple names match.
|
|
90
|
+
*/
|
|
91
|
+
static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& modelDir) {
|
|
92
|
+
size_t pos = modelDir.find_last_of("/\\");
|
|
93
|
+
std::string base = (pos == std::string::npos) ? modelDir : modelDir.substr(pos + 1);
|
|
94
|
+
std::string lower = ToLower(base);
|
|
95
|
+
|
|
96
|
+
std::vector<TtsModelKind> out;
|
|
97
|
+
auto add = [&out](TtsModelKind k) {
|
|
98
|
+
if (std::find(out.begin(), out.end(), k) == out.end())
|
|
99
|
+
out.push_back(k);
|
|
100
|
+
};
|
|
101
|
+
|
|
102
|
+
if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
|
|
103
|
+
if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
|
|
104
|
+
if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
|
|
105
|
+
if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
|
|
106
|
+
if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
|
|
107
|
+
if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
|
|
108
|
+
|
|
109
|
+
return out;
|
|
110
|
+
}
|
|
111
|
+
|
|
28
112
|
} // namespace
|
|
29
113
|
|
|
30
114
|
TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
|
|
@@ -45,10 +129,10 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
45
129
|
const int kMaxSearchDepth = 4;
|
|
46
130
|
const std::vector<FileEntry> files = ListFilesRecursive(modelDir, kMaxSearchDepth);
|
|
47
131
|
|
|
48
|
-
std::string tokensFile = FindFileByName(
|
|
49
|
-
std::
|
|
50
|
-
std::string dataDirPath =
|
|
51
|
-
std::string voicesFile = FindFileByName(
|
|
132
|
+
std::string tokensFile = FindFileByName(files, "tokens.txt");
|
|
133
|
+
std::vector<LexiconCandidate> lexiconCandidates = FindLexiconCandidates(files, modelDir);
|
|
134
|
+
std::string dataDirPath = FindDirectoryUnderRoot(files, modelDir, "espeak-ng-data");
|
|
135
|
+
std::string voicesFile = FindFileByName(files, "voices.bin");
|
|
52
136
|
|
|
53
137
|
std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
|
|
54
138
|
std::string vocoder = FindOnnxByAnyToken(files, {"vocoder", "vocos"}, std::nullopt);
|
|
@@ -57,8 +141,8 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
57
141
|
std::string lmFlow = FindOnnxByAnyToken(files, {"lm_flow", "lm-flow"}, std::nullopt);
|
|
58
142
|
std::string lmMain = FindOnnxByAnyToken(files, {"lm_main", "lm-main"}, std::nullopt);
|
|
59
143
|
std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
|
|
60
|
-
std::string vocabJsonFile = FindFileByName(
|
|
61
|
-
std::string tokenScoresJsonFile = FindFileByName(
|
|
144
|
+
std::string vocabJsonFile = FindFileByName(files, "vocab.json");
|
|
145
|
+
std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
|
|
62
146
|
|
|
63
147
|
std::vector<std::string> modelExcludes = {"acoustic", "vocoder", "encoder", "decoder", "joiner"};
|
|
64
148
|
std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
|
|
@@ -67,15 +151,24 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
67
151
|
}
|
|
68
152
|
|
|
69
153
|
bool hasVits = !ttsModel.empty();
|
|
70
|
-
|
|
71
|
-
bool
|
|
154
|
+
std::string modelDirLower = ToLower(modelDir);
|
|
155
|
+
bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
|
|
156
|
+
bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
|
|
157
|
+
|| (isLikelyMatcha && !ttsModel.empty() && !tokensFile.empty());
|
|
158
|
+
if (hasMatcha && acousticModel.empty())
|
|
159
|
+
acousticModel = ttsModel; // single-file Matcha: model.onnx is the acoustic model
|
|
160
|
+
bool hasVoicesFile = !voicesFile.empty();
|
|
161
|
+
bool isLikelyZipvoice = modelDirLower.find("zipvoice") != std::string::npos;
|
|
72
162
|
bool hasZipvoice = !encoder.empty() && !decoder.empty() && !vocoder.empty();
|
|
163
|
+
if (isLikelyZipvoice && !encoder.empty() && !decoder.empty() && vocoder.empty()) {
|
|
164
|
+
result.ok = false;
|
|
165
|
+
result.error = "TTS: Zipvoice distill variant (no vocoder) is not supported. Use a full Zipvoice model with vocoder or add vocos_24khz.onnx separately.";
|
|
166
|
+
return result;
|
|
167
|
+
}
|
|
73
168
|
bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
|
|
74
|
-
!textConditioner.empty() && !vocabJsonFile.empty() &&
|
|
75
|
-
|
|
76
|
-
bool hasDataDir = !dataDirPath.empty() && IsDirectory(dataDirPath);
|
|
169
|
+
!textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
|
|
170
|
+
bool hasDataDir = !dataDirPath.empty();
|
|
77
171
|
|
|
78
|
-
std::string modelDirLower = ToLower(modelDir);
|
|
79
172
|
bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
|
|
80
173
|
bool isLikelyKokoro = modelDirLower.find("kokoro") != std::string::npos;
|
|
81
174
|
|
|
@@ -120,22 +213,36 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
120
213
|
return result;
|
|
121
214
|
}
|
|
122
215
|
} else {
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
216
|
+
// Auto: Priority 1 – folder name candidates; Priority 2 – file-based disambiguation.
|
|
217
|
+
std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
|
|
218
|
+
if (!nameCandidates.empty()) {
|
|
219
|
+
for (TtsModelKind k : nameCandidates) {
|
|
220
|
+
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
|
|
221
|
+
hasVoicesFile, hasDataDir)) {
|
|
222
|
+
selected = k;
|
|
223
|
+
break;
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
// Fallback: no name-based candidates or none supported – use file-only order.
|
|
228
|
+
if (selected == TtsModelKind::kUnknown) {
|
|
229
|
+
if (hasMatcha) {
|
|
230
|
+
selected = TtsModelKind::kMatcha;
|
|
231
|
+
} else if (hasPocket) {
|
|
232
|
+
selected = TtsModelKind::kPocket;
|
|
233
|
+
} else if (hasZipvoice) {
|
|
234
|
+
selected = TtsModelKind::kZipvoice;
|
|
235
|
+
} else if (hasVoicesFile) {
|
|
236
|
+
if (isLikelyKitten && !isLikelyKokoro) {
|
|
237
|
+
selected = TtsModelKind::kKitten;
|
|
238
|
+
} else if (isLikelyKokoro && !isLikelyKitten) {
|
|
239
|
+
selected = TtsModelKind::kKokoro;
|
|
240
|
+
} else {
|
|
241
|
+
selected = TtsModelKind::kKokoro;
|
|
242
|
+
}
|
|
243
|
+
} else if (hasVits) {
|
|
244
|
+
selected = TtsModelKind::kVits;
|
|
136
245
|
}
|
|
137
|
-
} else if (hasVits) {
|
|
138
|
-
selected = TtsModelKind::kVits;
|
|
139
246
|
}
|
|
140
247
|
}
|
|
141
248
|
|
|
@@ -144,39 +251,22 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
144
251
|
return result;
|
|
145
252
|
}
|
|
146
253
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
254
|
+
std::string lexiconPath;
|
|
255
|
+
for (const auto& c : lexiconCandidates) {
|
|
256
|
+
result.lexiconLanguageCandidates.push_back(c.languageId);
|
|
150
257
|
}
|
|
151
|
-
if (
|
|
152
|
-
|
|
153
|
-
return result;
|
|
258
|
+
if (!lexiconCandidates.empty()) {
|
|
259
|
+
lexiconPath = lexiconCandidates[0].path;
|
|
154
260
|
}
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
}
|
|
159
|
-
if (selected == TtsModelKind::kPocket && !hasPocket) {
|
|
160
|
-
result.error = "TTS: Pocket model requested but required files not found in " + modelDir;
|
|
161
|
-
return result;
|
|
162
|
-
}
|
|
163
|
-
if (selected == TtsModelKind::kZipvoice && !hasZipvoice) {
|
|
164
|
-
result.error = "TTS: Zipvoice model requested but required files not found in " + modelDir;
|
|
165
|
-
return result;
|
|
166
|
-
}
|
|
167
|
-
if ((selected == TtsModelKind::kVits || selected == TtsModelKind::kMatcha ||
|
|
168
|
-
selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten ||
|
|
169
|
-
selected == TtsModelKind::kZipvoice) &&
|
|
170
|
-
!hasDataDir) {
|
|
171
|
-
result.error = "TTS: espeak-ng-data not found in " + modelDir +
|
|
172
|
-
". Copy espeak-ng-data into the model directory.";
|
|
173
|
-
return result;
|
|
261
|
+
|
|
262
|
+
if (selected == TtsModelKind::kMatcha && !acousticModel.empty() && vocoder.empty()) {
|
|
263
|
+
vocoder = acousticModel;
|
|
174
264
|
}
|
|
175
265
|
|
|
176
266
|
result.selectedKind = selected;
|
|
177
267
|
result.paths.ttsModel = ttsModel;
|
|
178
268
|
result.paths.tokens = tokensFile;
|
|
179
|
-
result.paths.lexicon =
|
|
269
|
+
result.paths.lexicon = lexiconPath;
|
|
180
270
|
result.paths.dataDir = dataDirPath;
|
|
181
271
|
result.paths.voices = voicesFile;
|
|
182
272
|
result.paths.acousticModel = acousticModel;
|
|
@@ -189,8 +279,10 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
189
279
|
result.paths.vocabJson = vocabJsonFile;
|
|
190
280
|
result.paths.tokenScoresJson = tokenScoresJsonFile;
|
|
191
281
|
|
|
192
|
-
|
|
193
|
-
|
|
282
|
+
auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
|
|
283
|
+
if (!validation.ok) {
|
|
284
|
+
result.ok = false;
|
|
285
|
+
result.error = validation.error;
|
|
194
286
|
return result;
|
|
195
287
|
}
|
|
196
288
|
|
|
@@ -21,6 +21,7 @@ enum class SttModelKind {
|
|
|
21
21
|
kFunAsrNano,
|
|
22
22
|
kFireRedAsr,
|
|
23
23
|
kMoonshine,
|
|
24
|
+
kMoonshineV2,
|
|
24
25
|
kDolphin,
|
|
25
26
|
kCanary,
|
|
26
27
|
kOmnilingual,
|
|
@@ -48,6 +49,8 @@ struct SttModelPaths {
|
|
|
48
49
|
std::string whisperEncoder;
|
|
49
50
|
std::string whisperDecoder;
|
|
50
51
|
std::string tokens;
|
|
52
|
+
/** BPE vocabulary for hotwords tokenization (sentencepiece export bpe.vocab). Optional. */
|
|
53
|
+
std::string bpeVocab;
|
|
51
54
|
std::string funasrEncoderAdaptor;
|
|
52
55
|
std::string funasrLLM;
|
|
53
56
|
std::string funasrEmbedding;
|
|
@@ -56,6 +59,8 @@ struct SttModelPaths {
|
|
|
56
59
|
std::string moonshineEncoder;
|
|
57
60
|
std::string moonshineUncachedDecoder;
|
|
58
61
|
std::string moonshineCachedDecoder;
|
|
62
|
+
/** Moonshine v2: encoder + mergedDecoder (reuse moonshineEncoder for encoder path). */
|
|
63
|
+
std::string moonshineMergedDecoder;
|
|
59
64
|
std::string dolphinModel;
|
|
60
65
|
std::string omnilingualModel;
|
|
61
66
|
std::string medasrModel;
|
|
@@ -66,6 +71,69 @@ struct SttModelPaths {
|
|
|
66
71
|
std::string canaryDecoder;
|
|
67
72
|
};
|
|
68
73
|
|
|
74
|
+
/**
 * Every candidate path gathered from the model directory before a model kind
 * has been selected (consumed by the STT detection steps). All members start
 * out empty.
 */
struct SttCandidatePaths {
    // Generic model components.
    std::string encoder;
    std::string decoder;
    std::string joiner;
    std::string paraformerModel;
    std::string ctcModel;

    // Vocabulary files.
    std::string tokens;
    std::string bpeVocab;

    // FunASR candidates.
    std::string funasrEncoderAdaptor;
    std::string funasrLLM;
    std::string funasrEmbedding;
    std::string funasrTokenizerDir;

    // Moonshine candidates.
    std::string moonshinePreprocessor;
    std::string moonshineEncoder;
    std::string moonshineUncachedDecoder;
    std::string moonshineCachedDecoder;
    std::string moonshineMergedDecoder;

    // NOTE(review): presumably the encoder candidate used for Moonshine v2 — confirm against the detection code.
    std::string encoderForV2;
};
|
|
94
|
+
|
|
95
|
+
/**
 * Hints inferred from the model directory name. Each isLikely* flag defaults to
 * false and is only a hint, not proof that the matching files exist.
 */
struct SttPathHints {
    bool isLikelyNemo{false};
    bool isLikelyTdt{false};
    bool isLikelyWenetCtc{false};
    bool isLikelySenseVoice{false};
    bool isLikelyFunAsrNano{false};
    bool isLikelyZipformer{false};
    bool isLikelyMoonshine{false};
    bool isLikelyDolphin{false};
    bool isLikelyFireRedAsr{false};
    bool isLikelyCanary{false};
    bool isLikelyOmnilingual{false};
    bool isLikelyMedAsr{false};
    bool isLikelyTeleSpeech{false};
    bool isLikelyToneCtc{false};
    bool isLikelyParaformer{false};

    /** VAD (silero, ten-vad, etc.) is not supported yet; when set, detection reports the model as unsupported. */
    bool isLikelyVad{false};

    /** TDNN (keyword/yesno) is not supported yet; when set, detection reports the model as unsupported. */
    bool isLikelyTdnn{false};
};
|
|
117
|
+
|
|
118
|
+
/**
 * Capability flags: which STT model types are possible given the gathered
 * paths and the directory-name hints. Every flag defaults to false.
 */
struct SttCapabilities {
    bool hasTransducer{false};
    bool hasWhisper{false};
    bool hasMoonshine{false};
    bool hasMoonshineV2{false};
    bool hasParaformer{false};
    bool hasFunAsrNano{false};
    bool hasDolphin{false};
    bool hasFireRedAsr{false};

    /**
     * Set when the directory name suggests Fire Red but only a single
     * CTC/paraformer model is present (no encoder/decoder); zipformer_ctc is
     * used in that case.
     */
    bool hasFireRedCtc{false};

    bool hasCanary{false};
    bool hasOmnilingual{false};
    bool hasMedAsr{false};
    bool hasTeleSpeechCtc{false};
    bool hasToneCtc{false};
};
|
|
136
|
+
|
|
69
137
|
struct TtsModelPaths {
|
|
70
138
|
std::string ttsModel;
|
|
71
139
|
std::string tokens;
|
|
@@ -87,6 +155,8 @@ struct TtsModelPaths {
|
|
|
87
155
|
struct SttDetectResult {
|
|
88
156
|
bool ok = false;
|
|
89
157
|
std::string error;
|
|
158
|
+
/** True when detection failed because the model is for unsupported hardware (RK35xx, Ascend, CANN, etc.). */
|
|
159
|
+
bool isHardwareSpecificUnsupported = false;
|
|
90
160
|
std::vector<DetectedModel> detectedModels;
|
|
91
161
|
SttModelKind selectedKind = SttModelKind::kUnknown;
|
|
92
162
|
bool tokensRequired = true;
|
|
@@ -99,6 +169,8 @@ struct TtsDetectResult {
|
|
|
99
169
|
std::vector<DetectedModel> detectedModels;
|
|
100
170
|
TtsModelKind selectedKind = TtsModelKind::kUnknown;
|
|
101
171
|
TtsModelPaths paths;
|
|
172
|
+
/** Language ids from detected lexicon files (e.g. "default", "us-en", "zh") for multi-lang Kokoro/Kitten. Empty when not applicable. */
|
|
173
|
+
std::vector<std::string> lexiconLanguageCandidates;
|
|
102
174
|
};
|
|
103
175
|
|
|
104
176
|
SttDetectResult DetectSttModel(
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* sherpa-onnx-validate-stt.h
|
|
3
|
+
*
|
|
4
|
+
* Declares ValidateSttPaths(): after model detection resolves a kind and populates
|
|
5
|
+
* SttModelPaths, this function checks that every *required* path field for that kind
|
|
6
|
+
* is non-empty. Returns a validation result with ok/error and the list of missing
|
|
7
|
+
* fields so the caller can surface a specific error instead of crashing at init time.
|
|
8
|
+
*/
|
|
9
|
+
#ifndef SHERPA_ONNX_VALIDATE_STT_H
|
|
10
|
+
#define SHERPA_ONNX_VALIDATE_STT_H
|
|
11
|
+
|
|
12
|
+
#include "sherpa-onnx-model-detect.h"
|
|
13
|
+
#include <string>
|
|
14
|
+
#include <vector>
|
|
15
|
+
|
|
16
|
+
namespace sherpaonnx {
|
|
17
|
+
|
|
18
|
+
struct SttFieldRequirement {
|
|
19
|
+
const char* fieldName;
|
|
20
|
+
std::string SttModelPaths::* field;
|
|
21
|
+
bool required;
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
/** Outcome of ValidateSttPaths(); a default-constructed value means success. */
struct SttValidationResult {
    /** Stays true unless at least one required field is missing. */
    bool ok{true};

    /** Names of the required fields that were empty. */
    std::vector<std::string> missingRequired;

    /** Human-readable message; left empty when ok is true. */
    std::string error;
};
|
|
29
|
+
|
|
30
|
+
/**
 * Checks that every required path field for `kind` is non-empty in `paths`.
 *
 * @param kind     Model kind resolved by detection.
 * @param paths    Paths populated by detection.
 * @param modelDir Model directory, used in the error message.
 * @return ok/error plus the list of missing required fields, so the caller can
 *         surface a specific error instead of failing at init time.
 */
SttValidationResult ValidateSttPaths(SttModelKind kind,
                                     const SttModelPaths& paths,
                                     const std::string& modelDir);
|
|
35
|
+
|
|
36
|
+
} // namespace sherpaonnx
|
|
37
|
+
|
|
38
|
+
#endif // SHERPA_ONNX_VALIDATE_STT_H
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* sherpa-onnx-validate-stt.mm
|
|
3
|
+
*
|
|
4
|
+
* Validates that all required file paths are set for a given SttModelKind.
|
|
5
|
+
* Requirements are declared in static tables at the top of this file —
|
|
6
|
+
* edit them when adding a new model type or changing what is required.
|
|
7
|
+
*/
|
|
8
|
+
#include "sherpa-onnx-validate-stt.h"
|
|
9
|
+
#include <cstddef>
|
|
10
|
+
#include <cstring>
|
|
11
|
+
|
|
12
|
+
namespace sherpaonnx {
|
|
13
|
+
namespace {
|
|
14
|
+
|
|
15
|
+
// ============================================================
|
|
16
|
+
// REQUIREMENT TABLES — one entry per SttModelKind (or group).
|
|
17
|
+
// Edit here when adding a new model type or changing requirements.
|
|
18
|
+
// ============================================================
|
|
19
|
+
|
|
20
|
+
static const SttFieldRequirement kTransducerReqs[] = {
|
|
21
|
+
{"encoder", &SttModelPaths::encoder, true},
|
|
22
|
+
{"decoder", &SttModelPaths::decoder, true},
|
|
23
|
+
{"joiner", &SttModelPaths::joiner, true},
|
|
24
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
25
|
+
{"bpeVocab", &SttModelPaths::bpeVocab, false},
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
// Offline paraformer uses paraformerModel; streaming paraformer uses encoder+decoder.
|
|
29
|
+
// Both are valid — validated via custom logic in ValidateSttPaths, not via this table.
|
|
30
|
+
static const SttFieldRequirement kParaformerReqs[] = {
|
|
31
|
+
{"paraformerModel", &SttModelPaths::paraformerModel, false},
|
|
32
|
+
{"encoder", &SttModelPaths::encoder, false},
|
|
33
|
+
{"decoder", &SttModelPaths::decoder, false},
|
|
34
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
static const SttFieldRequirement kCtcReqs[] = {
|
|
38
|
+
{"ctcModel", &SttModelPaths::ctcModel, true},
|
|
39
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
static const SttFieldRequirement kWhisperReqs[] = {
|
|
43
|
+
{"whisperEncoder", &SttModelPaths::whisperEncoder, true},
|
|
44
|
+
{"whisperDecoder", &SttModelPaths::whisperDecoder, true},
|
|
45
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
static const SttFieldRequirement kFunAsrNanoReqs[] = {
|
|
49
|
+
{"funasrEncoderAdaptor", &SttModelPaths::funasrEncoderAdaptor, true},
|
|
50
|
+
{"funasrLLM", &SttModelPaths::funasrLLM, true},
|
|
51
|
+
{"funasrEmbedding", &SttModelPaths::funasrEmbedding, true},
|
|
52
|
+
{"funasrTokenizer", &SttModelPaths::funasrTokenizer, true},
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
static const SttFieldRequirement kMoonshineReqs[] = {
|
|
56
|
+
{"moonshinePreprocessor", &SttModelPaths::moonshinePreprocessor, true},
|
|
57
|
+
{"moonshineEncoder", &SttModelPaths::moonshineEncoder, true},
|
|
58
|
+
{"moonshineUncachedDecoder", &SttModelPaths::moonshineUncachedDecoder, true},
|
|
59
|
+
{"moonshineCachedDecoder", &SttModelPaths::moonshineCachedDecoder, true},
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
static const SttFieldRequirement kMoonshineV2Reqs[] = {
|
|
63
|
+
{"moonshineEncoder", &SttModelPaths::moonshineEncoder, true},
|
|
64
|
+
{"moonshineMergedDecoder", &SttModelPaths::moonshineMergedDecoder, true},
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
static const SttFieldRequirement kFireRedReqs[] = {
|
|
68
|
+
{"fireRedEncoder", &SttModelPaths::fireRedEncoder, true},
|
|
69
|
+
{"fireRedDecoder", &SttModelPaths::fireRedDecoder, true},
|
|
70
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
static const SttFieldRequirement kCanaryReqs[] = {
|
|
74
|
+
{"canaryEncoder", &SttModelPaths::canaryEncoder, true},
|
|
75
|
+
{"canaryDecoder", &SttModelPaths::canaryDecoder, true},
|
|
76
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
static const SttFieldRequirement kDolphinReqs[] = {
|
|
80
|
+
{"dolphinModel", &SttModelPaths::dolphinModel, true},
|
|
81
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
static const SttFieldRequirement kOmnilingualReqs[] = {
|
|
85
|
+
{"omnilingualModel", &SttModelPaths::omnilingualModel, true},
|
|
86
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
static const SttFieldRequirement kMedAsrReqs[] = {
|
|
90
|
+
{"medasrModel", &SttModelPaths::medasrModel, true},
|
|
91
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
static const SttFieldRequirement kTeleSpeechReqs[] = {
|
|
95
|
+
{"telespeechCtcModel", &SttModelPaths::telespeechCtcModel, true},
|
|
96
|
+
{"tokens", &SttModelPaths::tokens, true},
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
// ============================================================
|
|
100
|
+
|
|
101
|
+
/** Stores the element count of `table` in `count` and returns a pointer to its first row. */
template <size_t N>
static const SttFieldRequirement* UseTable(const SttFieldRequirement (&table)[N], size_t& count) {
    count = N;
    return table;
}

/**
 * Looks up the requirement table for a model kind.
 *
 * @param kind  Model kind to look up.
 * @param count Out-parameter: number of rows in the returned table (0 if none).
 * @return Pointer to the first row, or nullptr when the kind has no table.
 */
static const SttFieldRequirement* GetRequirements(SttModelKind kind, size_t& count) {
    switch (kind) {
        // Both transducer flavours share one table.
        case SttModelKind::kTransducer:
        case SttModelKind::kNemoTransducer:
            return UseTable(kTransducerReqs, count);

        case SttModelKind::kParaformer:
            return UseTable(kParaformerReqs, count);

        // All single-model CTC kinds share one table.
        case SttModelKind::kNemoCtc:
        case SttModelKind::kWenetCtc:
        case SttModelKind::kSenseVoice:
        case SttModelKind::kZipformerCtc:
        case SttModelKind::kToneCtc:
            return UseTable(kCtcReqs, count);

        case SttModelKind::kWhisper:
            return UseTable(kWhisperReqs, count);

        case SttModelKind::kFunAsrNano:
            return UseTable(kFunAsrNanoReqs, count);

        case SttModelKind::kMoonshine:
            return UseTable(kMoonshineReqs, count);

        case SttModelKind::kMoonshineV2:
            return UseTable(kMoonshineV2Reqs, count);

        case SttModelKind::kFireRedAsr:
            return UseTable(kFireRedReqs, count);

        case SttModelKind::kCanary:
            return UseTable(kCanaryReqs, count);

        case SttModelKind::kDolphin:
            return UseTable(kDolphinReqs, count);

        case SttModelKind::kOmnilingual:
            return UseTable(kOmnilingualReqs, count);

        case SttModelKind::kMedAsr:
            return UseTable(kMedAsrReqs, count);

        case SttModelKind::kTeleSpeechCtc:
            return UseTable(kTeleSpeechReqs, count);

        default:
            // No table for this kind: the caller treats nullptr as "nothing to validate".
            count = 0;
            return nullptr;
    }
}
|
|
152
|
+
|
|
153
|
+
/**
 * Maps a model kind to the display name used in validation error messages.
 *
 * @return A string literal; "Unknown" for any kind without a dedicated name.
 */
static const char* SttKindToName(SttModelKind k) {
    const char* label = "Unknown";
    switch (k) {
        case SttModelKind::kTransducer:     label = "Transducer";      break;
        case SttModelKind::kNemoTransducer: label = "NeMo Transducer"; break;
        case SttModelKind::kParaformer:     label = "Paraformer";      break;
        case SttModelKind::kNemoCtc:        label = "NeMo CTC";        break;
        case SttModelKind::kWenetCtc:       label = "WeNet CTC";       break;
        case SttModelKind::kSenseVoice:     label = "SenseVoice";      break;
        case SttModelKind::kZipformerCtc:   label = "Zipformer CTC";   break;
        case SttModelKind::kWhisper:        label = "Whisper";         break;
        case SttModelKind::kFunAsrNano:     label = "FunASR Nano";     break;
        case SttModelKind::kFireRedAsr:     label = "Fire Red ASR";    break;
        case SttModelKind::kMoonshine:      label = "Moonshine";       break;
        case SttModelKind::kMoonshineV2:    label = "Moonshine v2";    break;
        case SttModelKind::kDolphin:        label = "Dolphin";         break;
        case SttModelKind::kCanary:         label = "Canary";          break;
        case SttModelKind::kOmnilingual:    label = "Omnilingual";     break;
        case SttModelKind::kMedAsr:         label = "MedASR";          break;
        case SttModelKind::kTeleSpeechCtc:  label = "TeleSpeech CTC";  break;
        case SttModelKind::kToneCtc:        label = "Tone CTC";        break;
        default:                                                       break;
    }
    return label;
}
|
|
176
|
+
|
|
177
|
+
/**
 * Returns an optional remediation hint for a missing field.
 *
 * @param fieldName NUL-terminated field name (compared case-sensitively).
 * @return A hint string for "tokens"; nullptr for every other name.
 */
static const char* GetFieldHint(const char* fieldName) {
    const bool isTokens = (std::strcmp(fieldName, "tokens") == 0);
    return isTokens ? "Ensure tokens.txt is present in the model directory." : nullptr;
}
|
|
182
|
+
|
|
183
|
+
} // namespace
|
|
184
|
+
|
|
185
|
+
/**
 * Verifies that every required path for `kind` is set in `paths`.
 *
 * Kinds without a requirement table are accepted as-is (ok stays true).
 * Paraformer gets one extra rule: either the offline model (paraformerModel)
 * or the streaming pair (encoder + decoder) must be present.
 *
 * @param kind     Model kind whose requirements are checked.
 * @param paths    Paths to inspect.
 * @param modelDir Model directory, quoted in the error message.
 * @return ok == true when nothing is missing; otherwise ok == false,
 *         missingRequired lists the missing names and error holds
 *         "STT <kind>: missing required files in <modelDir>: <name>[ (<hint>)], ...".
 */
SttValidationResult ValidateSttPaths(
    SttModelKind kind,
    const SttModelPaths& paths,
    const std::string& modelDir
) {
    SttValidationResult outcome;

    size_t rowCount = 0;
    const SttFieldRequirement* const table = GetRequirements(kind, rowCount);
    if (table == nullptr) {
        return outcome;  // nothing to validate for this kind
    }

    // Collect every required field whose path is empty.
    for (const SttFieldRequirement* row = table; row != table + rowCount; ++row) {
        if (!row->required) {
            continue;
        }
        const std::string& value = paths.*(row->field);
        if (value.empty()) {
            outcome.missingRequired.push_back(row->fieldName);
        }
    }

    // Paraformer: the table marks both layouts optional, so require at least one here.
    if (kind == SttModelKind::kParaformer) {
        const bool offlineReady = !paths.paraformerModel.empty();
        const bool streamingReady = !(paths.encoder.empty() || paths.decoder.empty());
        if (!(offlineReady || streamingReady)) {
            outcome.missingRequired.push_back("paraformerModel (or encoder+decoder for streaming)");
        }
    }

    if (outcome.missingRequired.empty()) {
        return outcome;
    }

    outcome.ok = false;
    outcome.error = std::string("STT ") + SttKindToName(kind)
        + ": missing required files in " + modelDir + ": ";

    // Comma-separated list of missing names, each optionally followed by a hint in parentheses.
    const char* separator = "";
    for (const std::string& name : outcome.missingRequired) {
        outcome.error += separator;
        separator = ", ";
        outcome.error += name;
        if (const char* hint = GetFieldHint(name.c_str())) {
            outcome.error += " (";
            outcome.error += hint;
            outcome.error += ")";
        }
    }
    return outcome;
}
|
|
228
|
+
|
|
229
|
+
} // namespace sherpaonnx
|