react-native-sherpa-onnx 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -15
- package/SherpaOnnx.podspec +13 -5
- package/android/prebuilt-download.gradle +18 -5
- package/android/prebuilt-versions.gradle +8 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +43 -142
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +12 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +694 -307
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +194 -99
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +90 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +70 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +39 -19
- package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
- package/ios/SherpaOnnx+STT.mm +2 -0
- package/ios/SherpaOnnx.mm +1 -1
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +9 -3
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +38 -54
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +620 -267
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +131 -28
- package/ios/model_detect/sherpa-onnx-model-detect.h +70 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +52 -0
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/stt/streaming.js +6 -3
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +16 -2
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +17 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/check-model-csvs.sh +72 -0
- package/scripts/setup-ios-framework.sh +48 -48
- package/src/NativeSherpaOnnx.ts +18 -2
- package/src/audio/index.ts +81 -0
- package/src/stt/streaming.ts +10 -5
- package/src/stt/streamingTypes.ts +1 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
|
@@ -1,16 +1,49 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* sherpa-onnx-model-detect-tts.cpp
|
|
3
3
|
*
|
|
4
|
-
* Purpose: Detects TTS model type and fills TtsModelPaths from a model directory.
|
|
5
|
-
*
|
|
4
|
+
* Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
|
|
5
|
+
* nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
|
|
6
|
+
*
|
|
7
|
+
* --- Detection pipeline (overview) ---
|
|
8
|
+
*
|
|
9
|
+
* 1. Gather files in modelDir (recursive), then map file names to logical paths (ttsModel,
|
|
10
|
+
* acousticModel, vocoder, encoder, decoder, lmFlow, lmMain, textConditioner, tokens, lexicon,
|
|
11
|
+
* dataDir, voices, vocabJson, tokenScoresJson). Path hints from directory name (isLikelyVits,
|
|
12
|
+
* isLikelyKitten, isLikelyKokoro).
|
|
13
|
+
*
|
|
14
|
+
* 2. Capabilities (hasVits, hasMatcha, hasPocket, hasZipvoice, hasVoicesFile, hasDataDir): which
|
|
15
|
+
* model types are *possible* given the paths. Multiple can be true (e.g. voices.bin can satisfy
|
|
16
|
+
* both Kokoro and Kitten).
|
|
17
|
+
*
|
|
18
|
+
* 3. detectedModels (for UI "Select model type"): built from capabilities only. Every kind with
|
|
19
|
+
* the corresponding has* == true is added (with existing rules: zipvoice only if !hasMatcha,
|
|
20
|
+
* vits when hasVits and no voices or ambiguous folder name).
|
|
21
|
+
*
|
|
22
|
+
* 4. selectedKind: from ResolveTtsKind(). If modelType is explicit, use it if capabilities allow.
|
|
23
|
+
* If modelType == "auto": Priority 1 = folder name (GetKindsFromDirNameTts: tokens like "vits",
|
|
24
|
+
* "matcha", "kokoro" in dir name → candidate kinds). Priority 2 = among those candidates, pick
|
|
25
|
+
* the first that CapabilitySupportsTtsKind(). Fallback = file-only order (matcha → pocket →
|
|
26
|
+
* zipvoice → kokoro/kitten → vits).
|
|
27
|
+
*
|
|
28
|
+
* 5. paths: all gathered paths are written into result.paths; the selected kind determines which
|
|
29
|
+
* engine is used at runtime.
|
|
30
|
+
*
|
|
31
|
+
* Result to caller: ok, error, detectedModels (list), selectedKind (single), paths.
|
|
6
32
|
*/
|
|
7
33
|
#include "sherpa-onnx-model-detect.h"
|
|
8
34
|
#include "sherpa-onnx-model-detect-helper.h"
|
|
35
|
+
#include <algorithm>
|
|
36
|
+
#include <string>
|
|
37
|
+
#include <vector>
|
|
38
|
+
#ifdef __ANDROID__
|
|
9
39
|
#include <android/log.h>
|
|
10
|
-
|
|
11
40
|
#define LOG_TAG "TtsModelDetect"
|
|
12
41
|
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
|
|
13
42
|
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
|
|
43
|
+
#else
|
|
44
|
+
#define LOGI(...) ((void)0)
|
|
45
|
+
#define LOGE(...) ((void)0)
|
|
46
|
+
#endif
|
|
14
47
|
|
|
15
48
|
namespace sherpaonnx {
|
|
16
49
|
namespace {
|
|
@@ -25,94 +58,117 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
|
|
|
25
58
|
return TtsModelKind::kUnknown;
|
|
26
59
|
}
|
|
27
60
|
|
|
28
|
-
|
|
61
|
+
/** Returns true if the given kind is supported by the current paths and hints (required files present). */
|
|
62
|
+
static bool CapabilitySupportsTtsKind(
|
|
63
|
+
TtsModelKind kind,
|
|
64
|
+
bool hasVits,
|
|
65
|
+
bool hasMatcha,
|
|
66
|
+
bool hasPocket,
|
|
67
|
+
bool hasZipvoice,
|
|
68
|
+
bool hasVoicesFile,
|
|
69
|
+
bool hasDataDir
|
|
70
|
+
) {
|
|
71
|
+
switch (kind) {
|
|
72
|
+
case TtsModelKind::kVits:
|
|
73
|
+
return hasVits && hasDataDir;
|
|
74
|
+
case TtsModelKind::kMatcha:
|
|
75
|
+
return hasMatcha && hasDataDir;
|
|
76
|
+
case TtsModelKind::kKokoro:
|
|
77
|
+
case TtsModelKind::kKitten:
|
|
78
|
+
return hasVoicesFile && hasDataDir;
|
|
79
|
+
case TtsModelKind::kPocket:
|
|
80
|
+
return hasPocket;
|
|
81
|
+
case TtsModelKind::kZipvoice:
|
|
82
|
+
return hasZipvoice;
|
|
83
|
+
default:
|
|
84
|
+
return false;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
29
87
|
|
|
30
|
-
|
|
88
|
+
/**
|
|
89
|
+
* Priority 1: Collect candidate TTS kinds from the model directory name (last path component).
|
|
90
|
+
* Tokens like "vits", "matcha", "kokoro" are matched case-insensitively. Returns candidates in a
|
|
91
|
+
* fixed priority order for file-based disambiguation when multiple names match.
|
|
92
|
+
*/
|
|
93
|
+
static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& modelDir) {
|
|
31
94
|
using namespace model_detect;
|
|
95
|
+
size_t pos = modelDir.find_last_of("/\\");
|
|
96
|
+
std::string base = (pos == std::string::npos) ? modelDir : modelDir.substr(pos + 1);
|
|
97
|
+
std::string lower = ToLower(base);
|
|
32
98
|
|
|
33
|
-
|
|
99
|
+
std::vector<TtsModelKind> out;
|
|
100
|
+
auto add = [&out](TtsModelKind k) {
|
|
101
|
+
if (std::find(out.begin(), out.end(), k) == out.end())
|
|
102
|
+
out.push_back(k);
|
|
103
|
+
};
|
|
34
104
|
|
|
35
|
-
|
|
105
|
+
if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
|
|
106
|
+
if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
|
|
107
|
+
if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
|
|
108
|
+
if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
|
|
109
|
+
if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
|
|
110
|
+
if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
|
|
36
111
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
LOGE("%s", result.error.c_str());
|
|
40
|
-
return result;
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
|
|
44
|
-
result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
|
|
45
|
-
LOGE("%s", result.error.c_str());
|
|
46
|
-
return result;
|
|
47
|
-
}
|
|
112
|
+
return out;
|
|
113
|
+
}
|
|
48
114
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
115
|
+
/** Shared detection logic: runs on a pre-built file list. No filesystem access, no logging. */
|
|
116
|
+
static TtsDetectResult DetectTtsModelFromFiles(
|
|
117
|
+
const std::vector<model_detect::FileEntry>& files,
|
|
118
|
+
const std::string& modelDir,
|
|
119
|
+
const std::string& modelType
|
|
120
|
+
) {
|
|
121
|
+
using namespace model_detect;
|
|
54
122
|
|
|
55
|
-
|
|
56
|
-
std::string lexiconFile = FindFileByName(modelDir, "lexicon.txt", 2);
|
|
57
|
-
std::string dataDirPath = FindDirectoryByName(modelDir, "espeak-ng-data", 2);
|
|
58
|
-
std::string voicesFile = FindFileByName(modelDir, "voices.bin", 2);
|
|
123
|
+
TtsDetectResult result;
|
|
59
124
|
|
|
60
|
-
|
|
61
|
-
|
|
125
|
+
std::string tokensFile = FindFileByName(files, "tokens.txt");
|
|
126
|
+
std::string lexiconFile = FindFileByName(files, "lexicon.txt");
|
|
127
|
+
std::string dataDirPath;
|
|
128
|
+
{
|
|
129
|
+
const std::string prefix = modelDir + "/espeak-ng-data/";
|
|
130
|
+
for (const auto& entry : files) {
|
|
131
|
+
if (entry.path.size() > prefix.size() && entry.path.compare(0, prefix.size(), prefix) == 0) {
|
|
132
|
+
dataDirPath = modelDir + "/espeak-ng-data";
|
|
133
|
+
break;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
std::string voicesFile = FindFileByName(files, "voices.bin");
|
|
62
138
|
|
|
63
139
|
std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
|
|
64
|
-
// Note: matches either a "vocoder" or "vocos" ONNX file; both are stored in this field.
|
|
65
140
|
std::string vocoder = FindOnnxByAnyToken(files, {"vocoder", "vocos"}, std::nullopt);
|
|
66
141
|
std::string encoder = FindOnnxByAnyToken(files, {"encoder"}, std::nullopt);
|
|
67
142
|
std::string decoder = FindOnnxByAnyToken(files, {"decoder"}, std::nullopt);
|
|
68
143
|
std::string lmFlow = FindOnnxByAnyToken(files, {"lm_flow", "lm-flow"}, std::nullopt);
|
|
69
144
|
std::string lmMain = FindOnnxByAnyToken(files, {"lm_main", "lm-main"}, std::nullopt);
|
|
70
145
|
std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
|
|
71
|
-
std::string vocabJsonFile = FindFileByName(
|
|
72
|
-
std::string tokenScoresJsonFile = FindFileByName(
|
|
73
|
-
|
|
74
|
-
LOGI("DetectTtsModel: acousticModel=%s, vocoder=%s, encoder=%s, decoder=%s",
|
|
75
|
-
acousticModel.c_str(), vocoder.c_str(), encoder.c_str(), decoder.c_str());
|
|
76
|
-
LOGI("DetectTtsModel: lmFlow=%s, lmMain=%s, textConditioner=%s, vocabJson=%s, tokenScoresJson=%s",
|
|
77
|
-
lmFlow.c_str(), lmMain.c_str(), textConditioner.c_str(), vocabJsonFile.c_str(), tokenScoresJsonFile.c_str());
|
|
146
|
+
std::string vocabJsonFile = FindFileByName(files, "vocab.json");
|
|
147
|
+
std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
|
|
78
148
|
|
|
79
149
|
std::vector<std::string> modelExcludes = {
|
|
80
|
-
"acoustic",
|
|
81
|
-
"vocoder",
|
|
82
|
-
"encoder",
|
|
83
|
-
"decoder",
|
|
84
|
-
"joiner"
|
|
150
|
+
"acoustic", "vocoder", "encoder", "decoder", "joiner"
|
|
85
151
|
};
|
|
86
|
-
|
|
87
152
|
std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
|
|
88
153
|
if (ttsModel.empty()) {
|
|
89
154
|
ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
|
|
90
155
|
}
|
|
91
|
-
LOGI("DetectTtsModel: ttsModel=%s", ttsModel.c_str());
|
|
92
156
|
|
|
93
157
|
bool hasVits = !ttsModel.empty();
|
|
94
158
|
bool hasMatcha = !acousticModel.empty() && !vocoder.empty();
|
|
95
|
-
bool hasVoicesFile = !voicesFile.empty()
|
|
96
|
-
// Zipvoice requires encoder + decoder + vocoder (full model). Distill variants (no vocoder) are not supported by the native layer.
|
|
159
|
+
bool hasVoicesFile = !voicesFile.empty();
|
|
97
160
|
bool hasZipvoice = !encoder.empty() && !decoder.empty() && !vocoder.empty();
|
|
98
161
|
bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
|
|
99
|
-
!textConditioner.empty() && !vocabJsonFile.empty() &&
|
|
100
|
-
|
|
101
|
-
bool hasDataDir = !dataDirPath.empty() && IsDirectory(dataDirPath);
|
|
162
|
+
!textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
|
|
163
|
+
bool hasDataDir = !dataDirPath.empty();
|
|
102
164
|
|
|
103
165
|
std::string modelDirLower = ToLower(modelDir);
|
|
104
166
|
bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
|
|
105
167
|
bool isLikelyKokoro = modelDirLower.find("kokoro") != std::string::npos;
|
|
106
168
|
|
|
107
|
-
if (hasMatcha) {
|
|
108
|
-
|
|
109
|
-
}
|
|
110
|
-
if (hasPocket) {
|
|
111
|
-
result.detectedModels.push_back({"pocket", modelDir});
|
|
112
|
-
}
|
|
113
|
-
if (hasZipvoice && !hasMatcha) {
|
|
114
|
-
result.detectedModels.push_back({"zipvoice", modelDir});
|
|
115
|
-
}
|
|
169
|
+
if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
|
|
170
|
+
if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
|
|
171
|
+
if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
|
|
116
172
|
if (hasVoicesFile) {
|
|
117
173
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
118
174
|
result.detectedModels.push_back({"kitten", modelDir});
|
|
@@ -123,23 +179,11 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
123
179
|
result.detectedModels.push_back({"kitten", modelDir});
|
|
124
180
|
}
|
|
125
181
|
}
|
|
126
|
-
|
|
127
182
|
if (hasVits) {
|
|
128
183
|
bool isLikelyVits = modelDirLower.find("vits") != std::string::npos;
|
|
129
184
|
bool voicesAmbiguous = !isLikelyKitten && !isLikelyKokoro;
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
if (!hasVoicesFile) {
|
|
133
|
-
addVits = true;
|
|
134
|
-
} else {
|
|
135
|
-
if (isLikelyVits || voicesAmbiguous) {
|
|
136
|
-
addVits = true;
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
if (addVits) {
|
|
141
|
-
result.detectedModels.push_back({"vits", modelDir});
|
|
142
|
-
}
|
|
185
|
+
bool addVits = !hasVoicesFile || isLikelyVits || voicesAmbiguous;
|
|
186
|
+
if (addVits) result.detectedModels.push_back({"vits", modelDir});
|
|
143
187
|
}
|
|
144
188
|
|
|
145
189
|
TtsModelKind selected = TtsModelKind::kUnknown;
|
|
@@ -150,22 +194,25 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
150
194
|
return result;
|
|
151
195
|
}
|
|
152
196
|
} else {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
selected = TtsModelKind::kKitten;
|
|
162
|
-
} else if (isLikelyKokoro && !isLikelyKitten) {
|
|
163
|
-
selected = TtsModelKind::kKokoro;
|
|
164
|
-
} else {
|
|
165
|
-
selected = TtsModelKind::kKokoro;
|
|
197
|
+
std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
|
|
198
|
+
if (!nameCandidates.empty()) {
|
|
199
|
+
for (TtsModelKind k : nameCandidates) {
|
|
200
|
+
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
|
|
201
|
+
hasVoicesFile, hasDataDir)) {
|
|
202
|
+
selected = k;
|
|
203
|
+
break;
|
|
204
|
+
}
|
|
166
205
|
}
|
|
167
|
-
}
|
|
168
|
-
|
|
206
|
+
}
|
|
207
|
+
if (selected == TtsModelKind::kUnknown) {
|
|
208
|
+
if (hasMatcha) selected = TtsModelKind::kMatcha;
|
|
209
|
+
else if (hasPocket) selected = TtsModelKind::kPocket;
|
|
210
|
+
else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
|
|
211
|
+
else if (hasVoicesFile) {
|
|
212
|
+
if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
|
|
213
|
+
else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
|
|
214
|
+
else selected = TtsModelKind::kKokoro;
|
|
215
|
+
} else if (hasVits) selected = TtsModelKind::kVits;
|
|
169
216
|
}
|
|
170
217
|
}
|
|
171
218
|
|
|
@@ -173,7 +220,6 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
173
220
|
result.error = "TTS: No compatible model type detected in " + modelDir;
|
|
174
221
|
return result;
|
|
175
222
|
}
|
|
176
|
-
|
|
177
223
|
if (selected == TtsModelKind::kVits && !hasVits) {
|
|
178
224
|
result.error = "TTS: VITS model requested but model file not found in " + modelDir;
|
|
179
225
|
return result;
|
|
@@ -182,7 +228,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
182
228
|
result.error = "TTS: Matcha model requested but required files not found in " + modelDir;
|
|
183
229
|
return result;
|
|
184
230
|
}
|
|
185
|
-
if ((selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten) && (!
|
|
231
|
+
if ((selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten) && (!hasVoicesFile || !hasDataDir)) {
|
|
186
232
|
result.error = "TTS: Kokoro/Kitten model requested but required files not found in " + modelDir;
|
|
187
233
|
return result;
|
|
188
234
|
}
|
|
@@ -196,8 +242,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
196
242
|
}
|
|
197
243
|
if ((selected == TtsModelKind::kVits || selected == TtsModelKind::kMatcha ||
|
|
198
244
|
selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten ||
|
|
199
|
-
selected == TtsModelKind::kZipvoice) &&
|
|
200
|
-
!hasDataDir) {
|
|
245
|
+
selected == TtsModelKind::kZipvoice) && !hasDataDir) {
|
|
201
246
|
result.error = "TTS: espeak-ng-data not found in " + modelDir +
|
|
202
247
|
". Copy espeak-ng-data into the model directory.";
|
|
203
248
|
return result;
|
|
@@ -206,7 +251,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
206
251
|
result.selectedKind = selected;
|
|
207
252
|
result.paths.ttsModel = ttsModel;
|
|
208
253
|
result.paths.tokens = tokensFile;
|
|
209
|
-
result.paths.lexicon = !lexiconFile.empty()
|
|
254
|
+
result.paths.lexicon = !lexiconFile.empty() ? lexiconFile : "";
|
|
210
255
|
result.paths.dataDir = dataDirPath;
|
|
211
256
|
result.paths.voices = voicesFile;
|
|
212
257
|
result.paths.acousticModel = acousticModel;
|
|
@@ -219,20 +264,70 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
219
264
|
result.paths.vocabJson = vocabJsonFile;
|
|
220
265
|
result.paths.tokenScoresJson = tokenScoresJsonFile;
|
|
221
266
|
|
|
222
|
-
|
|
223
|
-
static_cast<int>(selected), ttsModel.c_str());
|
|
224
|
-
LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
|
|
225
|
-
result.paths.tokens.c_str(), result.paths.dataDir.c_str());
|
|
226
|
-
|
|
227
|
-
if (selected != TtsModelKind::kPocket && (tokensFile.empty() || !FileExists(tokensFile))) {
|
|
267
|
+
if (selected != TtsModelKind::kPocket && tokensFile.empty()) {
|
|
228
268
|
result.error = "TTS: tokens.txt not found in " + modelDir;
|
|
229
|
-
LOGE("%s", result.error.c_str());
|
|
230
269
|
return result;
|
|
231
270
|
}
|
|
232
271
|
|
|
233
272
|
result.ok = true;
|
|
273
|
+
return result;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
} // namespace
|
|
277
|
+
|
|
278
|
+
TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
|
|
279
|
+
using namespace model_detect;
|
|
280
|
+
|
|
281
|
+
TtsDetectResult result;
|
|
282
|
+
|
|
283
|
+
LOGI("DetectTtsModel: modelDir=%s, modelType=%s", modelDir.c_str(), modelType.c_str());
|
|
284
|
+
|
|
285
|
+
if (modelDir.empty()) {
|
|
286
|
+
result.error = "TTS: Model directory is empty";
|
|
287
|
+
LOGE("%s", result.error.c_str());
|
|
288
|
+
return result;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
|
|
292
|
+
result.error = "TTS: Model directory does not exist or is not a directory: " + modelDir;
|
|
293
|
+
LOGE("%s", result.error.c_str());
|
|
294
|
+
return result;
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
const auto files = ListFilesRecursive(modelDir, 4);
|
|
298
|
+
LOGI("DetectTtsModel: Found %zu files in %s", files.size(), modelDir.c_str());
|
|
299
|
+
for (const auto& f : files) {
|
|
300
|
+
LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
result = DetectTtsModelFromFiles(files, modelDir, modelType);
|
|
304
|
+
if (!result.ok) {
|
|
305
|
+
if (!result.error.empty()) LOGE("%s", result.error.c_str());
|
|
306
|
+
return result;
|
|
307
|
+
}
|
|
308
|
+
LOGI("DetectTtsModel: tokens=%s, lexicon=%s, dataDir=%s, voices=%s",
|
|
309
|
+
result.paths.tokens.c_str(), result.paths.lexicon.c_str(),
|
|
310
|
+
result.paths.dataDir.c_str(), result.paths.voices.c_str());
|
|
311
|
+
LOGI("DetectTtsModel: selected kind=%d, ttsModel=%s",
|
|
312
|
+
static_cast<int>(result.selectedKind), result.paths.ttsModel.c_str());
|
|
313
|
+
LOGI("DetectTtsModel: final paths — tokens=%s, dataDir=%s",
|
|
314
|
+
result.paths.tokens.c_str(), result.paths.dataDir.c_str());
|
|
234
315
|
LOGI("DetectTtsModel: detection OK for %s", modelDir.c_str());
|
|
235
316
|
return result;
|
|
236
317
|
}
|
|
237
318
|
|
|
319
|
+
// Test-only: used by host-side model_detect_test; not used in production (Android/iOS use DetectTtsModel).
|
|
320
|
+
TtsDetectResult DetectTtsModelFromFileList(
|
|
321
|
+
const std::vector<model_detect::FileEntry>& files,
|
|
322
|
+
const std::string& modelDir,
|
|
323
|
+
const std::string& modelType
|
|
324
|
+
) {
|
|
325
|
+
TtsDetectResult result;
|
|
326
|
+
if (modelDir.empty()) {
|
|
327
|
+
result.error = "TTS: Model directory is empty";
|
|
328
|
+
return result;
|
|
329
|
+
}
|
|
330
|
+
return DetectTtsModelFromFiles(files, modelDir, modelType);
|
|
331
|
+
}
|
|
332
|
+
|
|
238
333
|
} // namespace sherpaonnx
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#define SHERPA_ONNX_MODEL_DETECT_H
|
|
3
3
|
|
|
4
4
|
#include "sherpa-onnx-common.h"
|
|
5
|
+
#include "sherpa-onnx-model-detect-helper.h"
|
|
5
6
|
#include <optional>
|
|
6
7
|
#include <string>
|
|
7
8
|
#include <vector>
|
|
@@ -21,6 +22,7 @@ enum class SttModelKind {
|
|
|
21
22
|
kFunAsrNano,
|
|
22
23
|
kFireRedAsr,
|
|
23
24
|
kMoonshine,
|
|
25
|
+
kMoonshineV2,
|
|
24
26
|
kDolphin,
|
|
25
27
|
kCanary,
|
|
26
28
|
kOmnilingual,
|
|
@@ -59,6 +61,8 @@ struct SttModelPaths {
|
|
|
59
61
|
std::string moonshineEncoder;
|
|
60
62
|
std::string moonshineUncachedDecoder;
|
|
61
63
|
std::string moonshineCachedDecoder;
|
|
64
|
+
/** Moonshine v2: encoder + mergedDecoder (reuse moonshineEncoder for encoder path). */
|
|
65
|
+
std::string moonshineMergedDecoder;
|
|
62
66
|
// Dolphin, Omnilingual, MedAsr, TeleSpeech (single model each)
|
|
63
67
|
std::string dolphinModel;
|
|
64
68
|
std::string omnilingualModel;
|
|
@@ -71,6 +75,69 @@ struct SttModelPaths {
|
|
|
71
75
|
std::string canaryDecoder;
|
|
72
76
|
};
|
|
73
77
|
|
|
78
|
+
/** All candidate paths gathered before model kind selection (used by STT detection steps). */
|
|
79
|
+
struct SttCandidatePaths {
|
|
80
|
+
std::string encoder;
|
|
81
|
+
std::string decoder;
|
|
82
|
+
std::string joiner;
|
|
83
|
+
std::string paraformerModel;
|
|
84
|
+
std::string ctcModel;
|
|
85
|
+
std::string tokens;
|
|
86
|
+
std::string bpeVocab;
|
|
87
|
+
std::string funasrEncoderAdaptor;
|
|
88
|
+
std::string funasrLLM;
|
|
89
|
+
std::string funasrEmbedding;
|
|
90
|
+
std::string funasrTokenizerDir;
|
|
91
|
+
std::string moonshinePreprocessor;
|
|
92
|
+
std::string moonshineEncoder;
|
|
93
|
+
std::string moonshineUncachedDecoder;
|
|
94
|
+
std::string moonshineCachedDecoder;
|
|
95
|
+
std::string moonshineMergedDecoder;
|
|
96
|
+
std::string encoderForV2;
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
/** Path hints derived from model directory name (isLikely* flags). */
|
|
100
|
+
struct SttPathHints {
|
|
101
|
+
bool isLikelyNemo = false;
|
|
102
|
+
bool isLikelyTdt = false;
|
|
103
|
+
bool isLikelyWenetCtc = false;
|
|
104
|
+
bool isLikelySenseVoice = false;
|
|
105
|
+
bool isLikelyFunAsrNano = false;
|
|
106
|
+
bool isLikelyZipformer = false;
|
|
107
|
+
bool isLikelyMoonshine = false;
|
|
108
|
+
bool isLikelyDolphin = false;
|
|
109
|
+
bool isLikelyFireRedAsr = false;
|
|
110
|
+
bool isLikelyCanary = false;
|
|
111
|
+
bool isLikelyOmnilingual = false;
|
|
112
|
+
bool isLikelyMedAsr = false;
|
|
113
|
+
bool isLikelyTeleSpeech = false;
|
|
114
|
+
bool isLikelyToneCtc = false;
|
|
115
|
+
bool isLikelyParaformer = false;
|
|
116
|
+
/** VAD (silero, ten-vad, etc.): not yet supported; when true, detection returns unsupported. */
|
|
117
|
+
bool isLikelyVad = false;
|
|
118
|
+
/** TDNN (keyword/yesno): not yet supported; when true, detection returns unsupported. */
|
|
119
|
+
bool isLikelyTdnn = false;
|
|
120
|
+
};
|
|
121
|
+
|
|
122
|
+
/** Which model types are possible given paths and hints (has* flags). */
|
|
123
|
+
struct SttCapabilities {
|
|
124
|
+
bool hasTransducer = false;
|
|
125
|
+
bool hasWhisper = false;
|
|
126
|
+
bool hasMoonshine = false;
|
|
127
|
+
bool hasMoonshineV2 = false;
|
|
128
|
+
bool hasParaformer = false;
|
|
129
|
+
bool hasFunAsrNano = false;
|
|
130
|
+
bool hasDolphin = false;
|
|
131
|
+
bool hasFireRedAsr = false;
|
|
132
|
+
/** True when dir name suggests Fire Red but only a single CTC/paraformer model (no encoder/decoder). Use zipformer_ctc. */
|
|
133
|
+
bool hasFireRedCtc = false;
|
|
134
|
+
bool hasCanary = false;
|
|
135
|
+
bool hasOmnilingual = false;
|
|
136
|
+
bool hasMedAsr = false;
|
|
137
|
+
bool hasTeleSpeechCtc = false;
|
|
138
|
+
bool hasToneCtc = false;
|
|
139
|
+
};
|
|
140
|
+
|
|
74
141
|
struct TtsModelPaths {
|
|
75
142
|
std::string ttsModel;
|
|
76
143
|
std::string tokens;
|
|
@@ -92,6 +159,8 @@ struct TtsModelPaths {
|
|
|
92
159
|
struct SttDetectResult {
|
|
93
160
|
bool ok = false;
|
|
94
161
|
std::string error;
|
|
162
|
+
/** True when detection failed because the model is for unsupported hardware (RK35xx, Ascend, CANN, etc.). */
|
|
163
|
+
bool isHardwareSpecificUnsupported = false;
|
|
95
164
|
std::vector<DetectedModel> detectedModels;
|
|
96
165
|
SttModelKind selectedKind = SttModelKind::kUnknown;
|
|
97
166
|
bool tokensRequired = true;
|
|
@@ -113,11 +182,32 @@ SttDetectResult DetectSttModel(
|
|
|
113
182
|
bool debug = false
|
|
114
183
|
);
|
|
115
184
|
|
|
185
|
+
/** Test-only: Like DetectSttModel but takes a pre-built file list; no filesystem access.
|
|
186
|
+
* Only used by the host-side C++ test suite (test/cpp/model_detect_test.cpp). Not used in
|
|
187
|
+
* production (Android/iOS use DetectSttModel). Does not validate modelDir existence or
|
|
188
|
+
* call FileExists on tokens/bpeVocab. */
|
|
189
|
+
SttDetectResult DetectSttModelFromFileList(
|
|
190
|
+
const std::vector<model_detect::FileEntry>& files,
|
|
191
|
+
const std::string& modelDir,
|
|
192
|
+
const std::optional<bool>& preferInt8 = std::nullopt,
|
|
193
|
+
const std::optional<std::string>& modelType = std::nullopt
|
|
194
|
+
);
|
|
195
|
+
|
|
116
196
|
TtsDetectResult DetectTtsModel(
|
|
117
197
|
const std::string& modelDir,
|
|
118
198
|
const std::string& modelType
|
|
119
199
|
);
|
|
120
200
|
|
|
201
|
+
/** Test-only: Like DetectTtsModel but takes a pre-built file list; no filesystem access.
|
|
202
|
+
* Only used by the host-side C++ test suite (test/cpp/model_detect_test.cpp). Not used in
|
|
203
|
+
* production (Android/iOS use DetectTtsModel). Does not validate modelDir existence or
|
|
204
|
+
* call FileExists / IsDirectory. */
|
|
205
|
+
TtsDetectResult DetectTtsModelFromFileList(
|
|
206
|
+
const std::vector<model_detect::FileEntry>& files,
|
|
207
|
+
const std::string& modelDir,
|
|
208
|
+
const std::string& modelType = "auto"
|
|
209
|
+
);
|
|
210
|
+
|
|
121
211
|
} // namespace sherpaonnx
|
|
122
212
|
|
|
123
213
|
#endif // SHERPA_ONNX_MODEL_DETECT_H
|
|
@@ -25,6 +25,7 @@ const char* SttModelKindToString(SttModelKind k) {
|
|
|
25
25
|
case SttModelKind::kFunAsrNano: return "funasr_nano";
|
|
26
26
|
case SttModelKind::kFireRedAsr: return "fire_red_asr";
|
|
27
27
|
case SttModelKind::kMoonshine: return "moonshine";
|
|
28
|
+
case SttModelKind::kMoonshineV2: return "moonshine_v2";
|
|
28
29
|
case SttModelKind::kDolphin: return "dolphin";
|
|
29
30
|
case SttModelKind::kCanary: return "canary";
|
|
30
31
|
case SttModelKind::kOmnilingual: return "omnilingual";
|
|
@@ -52,6 +53,7 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
|
|
|
52
53
|
|
|
53
54
|
PutBoolean(env, map, mapPut, "success", result.ok);
|
|
54
55
|
PutString(env, map, mapPut, "error", result.error);
|
|
56
|
+
PutBoolean(env, map, mapPut, "isHardwareSpecificUnsupported", result.isHardwareSpecificUnsupported);
|
|
55
57
|
PutString(env, map, mapPut, "modelType", SttModelKindToString(result.selectedKind));
|
|
56
58
|
|
|
57
59
|
jobject detectedList = BuildDetectedModelsList(env, result.detectedModels);
|
|
@@ -81,6 +83,7 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
|
|
|
81
83
|
PutString(env, pathsMap, mapPut, "moonshineEncoder", result.paths.moonshineEncoder);
|
|
82
84
|
PutString(env, pathsMap, mapPut, "moonshineUncachedDecoder", result.paths.moonshineUncachedDecoder);
|
|
83
85
|
PutString(env, pathsMap, mapPut, "moonshineCachedDecoder", result.paths.moonshineCachedDecoder);
|
|
86
|
+
PutString(env, pathsMap, mapPut, "moonshineMergedDecoder", result.paths.moonshineMergedDecoder);
|
|
84
87
|
PutString(env, pathsMap, mapPut, "dolphinModel", result.paths.dolphinModel);
|
|
85
88
|
PutString(env, pathsMap, mapPut, "omnilingualModel", result.paths.omnilingualModel);
|
|
86
89
|
PutString(env, pathsMap, mapPut, "medasrModel", result.paths.medasrModel);
|
|
@@ -55,6 +55,7 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
|
|
|
55
55
|
{ instanceId, requestId, cancelled -> emitTtsStreamEnd(instanceId, requestId, cancelled) }
|
|
56
56
|
)
|
|
57
57
|
private val archiveHelper = SherpaOnnxArchiveHelper()
|
|
58
|
+
private var pcmCapture: SherpaOnnxPcmCapture? = null
|
|
58
59
|
|
|
59
60
|
override fun getName(): String {
|
|
60
61
|
return NAME
|
|
@@ -62,6 +63,8 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
|
|
|
62
63
|
|
|
63
64
|
override fun onCatalystInstanceDestroy() {
|
|
64
65
|
super.onCatalystInstanceDestroy()
|
|
66
|
+
pcmCapture?.stop()
|
|
67
|
+
pcmCapture = null
|
|
65
68
|
onlineSttHelper.shutdown()
|
|
66
69
|
ttsHelper.shutdown()
|
|
67
70
|
}
|
|
@@ -319,12 +322,14 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
|
|
|
319
322
|
return
|
|
320
323
|
}
|
|
321
324
|
val success = result["success"] as? Boolean ?: false
|
|
325
|
+
val isHardwareSpecificUnsupported = result["isHardwareSpecificUnsupported"] as? Boolean ?: false
|
|
322
326
|
val detectedModels = result["detectedModels"] as? ArrayList<*>
|
|
323
327
|
?: arrayListOf<HashMap<String, String>>()
|
|
324
328
|
val modelTypeStr = result["modelType"] as? String
|
|
325
329
|
|
|
326
330
|
val resultMap = Arguments.createMap()
|
|
327
331
|
resultMap.putBoolean("success", success)
|
|
332
|
+
resultMap.putBoolean("isHardwareSpecificUnsupported", isHardwareSpecificUnsupported)
|
|
328
333
|
val modelsArray = Arguments.createArray()
|
|
329
334
|
for (model in detectedModels) {
|
|
330
335
|
val modelMap = model as? HashMap<*, *>
|
|
@@ -484,6 +489,71 @@ class SherpaOnnxModule(reactContext: ReactApplicationContext) :
|
|
|
484
489
|
onlineSttHelper.processSttAudioChunk(streamId, samples, sampleRate.toInt(), promise)
|
|
485
490
|
}
|
|
486
491
|
|
|
492
|
+
override fun startPcmLiveStream(options: ReadableMap, promise: Promise) {
|
|
493
|
+
try {
|
|
494
|
+
pcmCapture?.stop()
|
|
495
|
+
pcmCapture = null
|
|
496
|
+
val sampleRate = options.getDouble("sampleRate").toInt().takeIf { it > 0 } ?: 16000
|
|
497
|
+
val channelCount = if (options.hasKey("channelCount")) options.getDouble("channelCount").toInt().coerceIn(1, 2) else 1
|
|
498
|
+
val bufferSizeFrames = if (options.hasKey("bufferSizeFrames")) options.getDouble("bufferSizeFrames").toInt() else 0
|
|
499
|
+
var startError: String? = null
|
|
500
|
+
var started = false
|
|
501
|
+
val capture = SherpaOnnxPcmCapture(
|
|
502
|
+
targetSampleRate = sampleRate,
|
|
503
|
+
channelCount = channelCount,
|
|
504
|
+
bufferSizeFrames = bufferSizeFrames,
|
|
505
|
+
onChunk = { base64Pcm, sr -> emitPcmLiveStreamData(base64Pcm, sr) },
|
|
506
|
+
onError = { msg ->
|
|
507
|
+
if (!started) {
|
|
508
|
+
startError = msg
|
|
509
|
+
} else {
|
|
510
|
+
emitPcmLiveStreamError(msg)
|
|
511
|
+
}
|
|
512
|
+
},
|
|
513
|
+
logTag = NAME
|
|
514
|
+
)
|
|
515
|
+
pcmCapture = capture
|
|
516
|
+
capture.start()
|
|
517
|
+
started = true
|
|
518
|
+
val err = startError
|
|
519
|
+
if (err != null) {
|
|
520
|
+
promise.reject("PCM_LIVE_STREAM_ERROR", err)
|
|
521
|
+
} else {
|
|
522
|
+
promise.resolve(null)
|
|
523
|
+
}
|
|
524
|
+
} catch (e: Exception) {
|
|
525
|
+
android.util.Log.e(NAME, "startPcmLiveStream failed", e)
|
|
526
|
+
promise.reject("PCM_LIVE_STREAM_ERROR", e.message ?: "Failed to start PCM capture", e)
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
override fun stopPcmLiveStream(promise: Promise) {
|
|
531
|
+
try {
|
|
532
|
+
pcmCapture?.stop()
|
|
533
|
+
pcmCapture = null
|
|
534
|
+
promise.resolve(null)
|
|
535
|
+
} catch (e: Exception) {
|
|
536
|
+
promise.reject("PCM_LIVE_STREAM_ERROR", e.message ?: "Failed to stop PCM capture", e)
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
private fun emitPcmLiveStreamData(base64Pcm: String, sampleRate: Int) {
|
|
541
|
+
val eventEmitter = reactApplicationContext
|
|
542
|
+
.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
|
|
543
|
+
val payload = Arguments.createMap()
|
|
544
|
+
payload.putString("base64Pcm", base64Pcm)
|
|
545
|
+
payload.putInt("sampleRate", sampleRate)
|
|
546
|
+
eventEmitter.emit("pcmLiveStreamData", payload)
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
private fun emitPcmLiveStreamError(message: String) {
|
|
550
|
+
val eventEmitter = reactApplicationContext
|
|
551
|
+
.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
|
|
552
|
+
val payload = Arguments.createMap()
|
|
553
|
+
payload.putString("message", message)
|
|
554
|
+
eventEmitter.emit("pcmLiveStreamError", payload)
|
|
555
|
+
}
|
|
556
|
+
|
|
487
557
|
// ==================== STT Methods ====================
|
|
488
558
|
|
|
489
559
|
/**
|