react-native-sherpa-onnx 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +232 -236
- package/SherpaOnnx.podspec +68 -64
- package/android/build.gradle +182 -192
- package/android/codegen.gradle +57 -0
- package/android/prebuilt-download.gradle +428 -0
- package/android/prebuilt-versions.gradle +43 -0
- package/android/proguard-rules.pro +10 -0
- package/android/src/main/assets/testModels/add_mul_add.onnx +28 -0
- package/android/src/main/assets/testModels/nnapi_internal_uint8_support.onnx +0 -0
- package/android/src/main/assets/testModels/qnn_multi_ctx_embed.onnx +0 -0
- package/android/src/main/cpp/CMakeLists.txt +166 -129
- package/android/src/main/cpp/CMakePresets.json +54 -0
- package/android/src/main/cpp/crypto/sha256.cpp +174 -0
- package/android/src/main/cpp/crypto/sha256.h +16 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +404 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +56 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +181 -0
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +888 -0
- package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-common.h +18 -18
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +86 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +20 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +423 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +55 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +399 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +238 -0
- package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-model-detect.h +122 -89
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +99 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.h +16 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +78 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.h +16 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +190 -0
- package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +301 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +94 -0
- package/android/src/main/java/com/sherpaonnx/{SherpaOnnxCoreHelper.kt → SherpaOnnxAssetHelper.kt} +350 -236
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +791 -483
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +699 -109
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +1123 -668
- package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +187 -0
- package/ios/SherpaOnnx+Assets.h +11 -0
- package/ios/SherpaOnnx+Assets.mm +325 -0
- package/ios/SherpaOnnx+STT.mm +455 -118
- package/ios/SherpaOnnx+TTS.mm +1101 -712
- package/ios/SherpaOnnx.h +17 -6
- package/ios/SherpaOnnx.mm +206 -311
- package/ios/SherpaOnnx.xcconfig +19 -19
- package/ios/SherpaOnnxCoreMLHelper.swift +24 -0
- package/ios/archive/sherpa-onnx-archive-helper.h +21 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +296 -0
- package/ios/libarchive_darwin_config.h +153 -0
- package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-common.h +18 -18
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +49 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +210 -0
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +344 -0
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +201 -0
- package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-model-detect.h +117 -89
- package/ios/scripts/patch-libarchive-includes.sh +61 -0
- package/ios/scripts/setup-ios-libarchive.sh +98 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +129 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +523 -0
- package/ios/{sherpa-onnx-tts-wrapper.h → tts/sherpa-onnx-tts-wrapper.h} +90 -85
- package/ios/{sherpa-onnx-tts-wrapper.mm → tts/sherpa-onnx-tts-wrapper.mm} +376 -345
- package/lib/module/NativeSherpaOnnx.js +3 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +22 -0
- package/lib/module/audio/index.js.map +1 -0
- package/lib/module/diarization/index.js +1 -1
- package/lib/module/diarization/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +918 -0
- package/lib/module/download/ModelDownloadManager.js.map +1 -0
- package/lib/module/download/extractTarBz2.js +53 -0
- package/lib/module/download/extractTarBz2.js.map +1 -0
- package/lib/module/download/index.js +6 -0
- package/lib/module/download/index.js.map +1 -0
- package/lib/module/download/validation.js +178 -0
- package/lib/module/download/validation.js.map +1 -0
- package/lib/module/enhancement/index.js +1 -1
- package/lib/module/enhancement/index.js.map +1 -1
- package/lib/module/index.js +41 -3
- package/lib/module/index.js.map +1 -1
- package/lib/module/separation/index.js +1 -1
- package/lib/module/separation/index.js.map +1 -1
- package/lib/module/stt/index.js +127 -60
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/sttModelLanguages.js +512 -0
- package/lib/module/stt/sttModelLanguages.js.map +1 -0
- package/lib/module/stt/types.js +53 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +216 -289
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/types.js +86 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/module/types.js.map +1 -1
- package/lib/module/utils.js +86 -73
- package/lib/module/utils.js.map +1 -1
- package/lib/module/vad/index.js +1 -1
- package/lib/module/vad/index.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +192 -38
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +13 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -0
- package/lib/typescript/src/diarization/index.d.ts +3 -2
- package/lib/typescript/src/diarization/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +108 -0
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -0
- package/lib/typescript/src/download/extractTarBz2.d.ts +14 -0
- package/lib/typescript/src/download/extractTarBz2.d.ts.map +1 -0
- package/lib/typescript/src/download/index.d.ts +7 -0
- package/lib/typescript/src/download/index.d.ts.map +1 -0
- package/lib/typescript/src/download/validation.d.ts +57 -0
- package/lib/typescript/src/download/validation.d.ts.map +1 -0
- package/lib/typescript/src/enhancement/index.d.ts +3 -2
- package/lib/typescript/src/enhancement/index.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +26 -2
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/separation/index.d.ts +3 -2
- package/lib/typescript/src/separation/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/index.d.ts +31 -43
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/sttModelLanguages.d.ts +52 -0
- package/lib/typescript/src/stt/sttModelLanguages.d.ts.map +1 -0
- package/lib/typescript/src/stt/types.d.ts +196 -9
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +25 -211
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +148 -25
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/lib/typescript/src/types.d.ts +0 -32
- package/lib/typescript/src/types.d.ts.map +1 -1
- package/lib/typescript/src/utils.d.ts +28 -13
- package/lib/typescript/src/utils.d.ts.map +1 -1
- package/lib/typescript/src/vad/index.d.ts +3 -2
- package/lib/typescript/src/vad/index.d.ts.map +1 -1
- package/package.json +250 -222
- package/scripts/check-qnn-support.sh +78 -0
- package/scripts/setup-ios-framework.sh +379 -282
- package/src/NativeSherpaOnnx.ts +474 -251
- package/src/audio/index.ts +32 -0
- package/src/diarization/index.ts +4 -2
- package/src/download/ModelDownloadManager.ts +1325 -0
- package/src/download/extractTarBz2.ts +78 -0
- package/src/download/index.ts +43 -0
- package/src/download/validation.ts +279 -0
- package/src/enhancement/index.ts +4 -2
- package/src/index.tsx +78 -27
- package/src/separation/index.ts +4 -2
- package/src/stt/index.ts +249 -89
- package/src/stt/sttModelLanguages.ts +237 -0
- package/src/stt/types.ts +263 -9
- package/src/tts/index.ts +470 -458
- package/src/tts/types.ts +373 -218
- package/src/types.ts +0 -44
- package/src/utils.ts +145 -131
- package/src/vad/index.ts +4 -2
- package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -0
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -0
- package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +0 -1918
- package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +0 -841
- package/android/src/main/cpp/jni/sherpa-onnx-model-detect.cpp +0 -541
- package/android/src/main/cpp/jni/sherpa-onnx-stt-jni.cpp +0 -336
- package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.cpp +0 -222
- package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.h +0 -68
- package/android/src/main/cpp/jni/sherpa-onnx-tts-jni.cpp +0 -823
- package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.cpp +0 -387
- package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.h +0 -147
- package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
- package/ios/include/sherpa-onnx/c-api/c-api.h +0 -1918
- package/ios/include/sherpa-onnx/c-api/cxx-api.h +0 -841
- package/ios/sherpa-onnx-model-detect.mm +0 -441
- package/ios/sherpa-onnx-stt-wrapper.h +0 -48
- package/ios/sherpa-onnx-stt-wrapper.mm +0 -201
- package/scripts/copy-headers.js +0 -184
- package/scripts/setup-assets.js +0 -323
|
@@ -1,541 +0,0 @@
|
|
|
1
|
-
#include "sherpa-onnx-model-detect.h"
|
|
2
|
-
|
|
3
|
-
#include <algorithm>
|
|
4
|
-
#include <cctype>
|
|
5
|
-
#include <string>
|
|
6
|
-
#include <vector>
|
|
7
|
-
|
|
8
|
-
#if __cplusplus >= 201703L && __has_include(<filesystem>)
|
|
9
|
-
#include <filesystem>
|
|
10
|
-
namespace fs = std::filesystem;
|
|
11
|
-
#elif __has_include(<experimental/filesystem>)
|
|
12
|
-
#include <experimental/filesystem>
|
|
13
|
-
namespace fs = std::experimental::filesystem;
|
|
14
|
-
#else
|
|
15
|
-
#include <dirent.h>
|
|
16
|
-
#include <sys/stat.h>
|
|
17
|
-
#endif
|
|
18
|
-
|
|
19
|
-
namespace sherpaonnx {
|
|
20
|
-
namespace {
|
|
21
|
-
|
|
22
|
-
// Returns true when `path` names an existing filesystem entry (of any type).
//
// This is a pure predicate and never throws: the std::filesystem branches use
// the error_code overload, so failures such as an over-long name or a
// permission error are reported as "does not exist" instead of raising
// std::filesystem::filesystem_error.
bool FileExists(const std::string& path) {
#if __cplusplus >= 201703L && __has_include(<filesystem>)
    std::error_code ec;
    // The error_code overload returns false when an error occurs.
    return std::filesystem::exists(path, ec);
#elif __has_include(<experimental/filesystem>)
    std::error_code ec;
    return std::experimental::filesystem::exists(path, ec);
#else
    // No filesystem library available: a successful stat() means the entry exists.
    struct stat buffer;
    return (stat(path.c_str(), &buffer) == 0);
#endif
}
|
|
32
|
-
|
|
33
|
-
// Returns true when `path` exists and refers to a directory.
//
// Never throws: the std::filesystem branches use the error_code overload, so
// any lookup failure (missing entry, over-long name, permission error) simply
// yields false.
bool IsDirectory(const std::string& path) {
#if __cplusplus >= 201703L && __has_include(<filesystem>)
    std::error_code ec;
    // The error_code overload returns false when an error occurs.
    return std::filesystem::is_directory(path, ec);
#elif __has_include(<experimental/filesystem>)
    std::error_code ec;
    return std::experimental::filesystem::is_directory(path, ec);
#else
    // No filesystem library available: fall back to stat() and test the mode bits.
    struct stat buffer;
    if (stat(path.c_str(), &buffer) != 0) return false;
    return S_ISDIR(buffer.st_mode);
#endif
}
|
|
44
|
-
|
|
45
|
-
// Lists the immediate sub-directories of `path` (non-recursive).
//
// Each returned element is the path of one sub-directory, built from `path`
// plus the entry name. The call is best effort: if the directory cannot be
// opened the result is empty, and if iteration fails part-way the entries
// collected so far are returned. No exception escapes this function.
std::vector<std::string> ListDirectories(const std::string& path) {
    std::vector<std::string> results;
#if __cplusplus >= 201703L && __has_include(<filesystem>)
    try {
        for (const auto& entry : std::filesystem::directory_iterator(path)) {
            if (entry.is_directory()) {
                results.push_back(entry.path().string());
            }
        }
    } catch (const std::exception&) {
        // Deliberately swallowed: callers treat "cannot list" as "nothing found".
    }
#elif __has_include(<experimental/filesystem>)
    try {
        for (const auto& entry : std::experimental::filesystem::directory_iterator(path)) {
            // The experimental directory_entry has no is_directory() member,
            // so query the entry's status through the free function instead.
            if (std::experimental::filesystem::is_directory(entry.status())) {
                results.push_back(entry.path().string());
            }
        }
    } catch (const std::exception&) {
        // Deliberately swallowed: callers treat "cannot list" as "nothing found".
    }
#else
    DIR* dir = opendir(path.c_str());
    if (!dir) return results;
    while (auto* entry = readdir(dir)) {
        // d_name is an array inside dirent, so it needs no null check.
        const std::string name = entry->d_name;
        if (name == "." || name == "..") continue;
        const std::string full = path + "/" + name;
        struct stat st;
        if (stat(full.c_str(), &st) == 0 && S_ISDIR(st.st_mode)) {
            results.push_back(full);
        }
    }
    closedir(dir);
#endif
    return results;
}
|
|
82
|
-
|
|
83
|
-
// Returns a copy of `value` in which every character has been passed through
// std::tolower.
std::string ToLower(std::string value) {
    for (char& ch : value) {
        // std::tolower expects a value representable as unsigned char, so the
        // character is converted before the call and narrowed back afterwards.
        const unsigned char raw = static_cast<unsigned char>(ch);
        ch = static_cast<char>(std::tolower(raw));
    }
    return value;
}
|
|
89
|
-
|
|
90
|
-
std::string ResolveTokenizerDir(const std::string& modelDir) {
|
|
91
|
-
std::string vocabInMain = modelDir + "/vocab.json";
|
|
92
|
-
if (FileExists(vocabInMain)) {
|
|
93
|
-
return modelDir;
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
for (const auto& dir : ListDirectories(modelDir)) {
|
|
97
|
-
std::string dirName = dir;
|
|
98
|
-
#if __cplusplus >= 201703L && __has_include(<filesystem>)
|
|
99
|
-
try {
|
|
100
|
-
dirName = fs::path(dir).filename().string();
|
|
101
|
-
} catch (const std::exception&) {
|
|
102
|
-
}
|
|
103
|
-
#elif __has_include(<experimental/filesystem>)
|
|
104
|
-
try {
|
|
105
|
-
dirName = fs::path(dir).filename().string();
|
|
106
|
-
} catch (const std::exception&) {
|
|
107
|
-
}
|
|
108
|
-
#else
|
|
109
|
-
// best effort: use full path if we cannot parse the filename
|
|
110
|
-
#endif
|
|
111
|
-
std::string dirNameLower = ToLower(dirName);
|
|
112
|
-
if (dirNameLower.find("qwen3") != std::string::npos) {
|
|
113
|
-
std::string vocabPath = dir + "/vocab.json";
|
|
114
|
-
if (FileExists(vocabPath)) {
|
|
115
|
-
return dir;
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
std::string commonPath = modelDir + "/Qwen3-0.6B";
|
|
121
|
-
if (FileExists(commonPath + "/vocab.json")) {
|
|
122
|
-
return commonPath;
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
return "";
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
// Maps a model-type name to its SttModelKind.
//
// Recognized names are listed in the table below ("zipformer_ctc" and "ctc"
// are aliases for the same kind). The comparison is exact; any other string
// yields SttModelKind::kUnknown.
SttModelKind ParseSttModelType(const std::string& modelType) {
    struct NamedKind {
        const char* name;
        SttModelKind kind;
    };
    static const NamedKind kKnownKinds[] = {
        {"transducer", SttModelKind::kTransducer},
        {"paraformer", SttModelKind::kParaformer},
        {"nemo_ctc", SttModelKind::kNemoCtc},
        {"wenet_ctc", SttModelKind::kWenetCtc},
        {"sense_voice", SttModelKind::kSenseVoice},
        {"zipformer_ctc", SttModelKind::kZipformerCtc},
        {"ctc", SttModelKind::kZipformerCtc},
        {"whisper", SttModelKind::kWhisper},
        {"funasr_nano", SttModelKind::kFunAsrNano},
    };

    for (const NamedKind& candidate : kKnownKinds) {
        if (modelType == candidate.name) {
            return candidate.kind;
        }
    }
    return SttModelKind::kUnknown;
}
|
|
139
|
-
|
|
140
|
-
// Maps a model-type name to its TtsModelKind.
//
// The comparison is exact; any string not listed in the table below yields
// TtsModelKind::kUnknown.
TtsModelKind ParseTtsModelType(const std::string& modelType) {
    struct NamedKind {
        const char* name;
        TtsModelKind kind;
    };
    static const NamedKind kKnownKinds[] = {
        {"vits", TtsModelKind::kVits},
        {"matcha", TtsModelKind::kMatcha},
        {"kokoro", TtsModelKind::kKokoro},
        {"kitten", TtsModelKind::kKitten},
        {"zipvoice", TtsModelKind::kZipvoice},
    };

    for (const NamedKind& candidate : kKnownKinds) {
        if (modelType == candidate.name) {
            return candidate.kind;
        }
    }
    return TtsModelKind::kUnknown;
}
|
|
148
|
-
|
|
149
|
-
} // namespace
|
|
150
|
-
|
|
151
|
-
// Inspects `modelDir` and decides which speech-to-text model it contains.
//
// Parameters:
//   modelDir   - directory holding the model files; must exist.
//   preferInt8 - when set to false, full-precision files (*.onnx) are chosen
//                ahead of quantized ones (*.int8.onnx); when true or unset the
//                quantized file is chosen first. In every case the other
//                variant is used as a fallback if the preferred one is absent.
//                This applies to model.onnx, the Whisper encoder/decoder and
//                the FunASR Nano files alike.
//   modelType  - explicit model type name, or unset / "auto" to auto-detect.
//
// Returns an SttDetectResult. On success `ok` is true and `selectedKind`,
// `tokensRequired` and the relevant `paths` members are filled in. On failure
// `ok` keeps its default and `error` describes the problem. `detectedModels`
// lists every candidate type found in the directory, whichever one is selected.
//
// Notes on detection:
//   * Paraformer and all CTC flavours use the same file name (model.onnx /
//     model.int8.onnx); they are told apart only by hints in the directory
//     path ("nemo"/"parakeet", "wenet", "sense"), matched case-insensitively.
//   * Without such a hint a lone model.onnx is auto-detected as paraformer;
//     zipformer CTC has to be requested explicitly.
SttDetectResult DetectSttModel(
    const std::string& modelDir,
    const std::optional<bool>& preferInt8,
    const std::optional<std::string>& modelType
) {
    SttDetectResult result;

    if (modelDir.empty()) {
        result.error = "Model directory is empty";
        return result;
    }

    if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
        result.error = "Model directory does not exist or is not a directory: " + modelDir;
        return result;
    }

    // Quantized files win unless the caller explicitly asked for full precision.
    const bool int8First = preferInt8.value_or(true);

    // Returns whichever of the two variants exists, trying the preferred one
    // first; empty string when neither file is present.
    const auto pickVariant = [int8First](const std::string& fullPath,
                                         const std::string& int8Path) -> std::string {
        const std::string& first = int8First ? int8Path : fullPath;
        const std::string& second = int8First ? fullPath : int8Path;
        if (FileExists(first)) return first;
        if (FileExists(second)) return second;
        return std::string();
    };

    const std::string encoderPath = modelDir + "/encoder.onnx";
    const std::string decoderPath = modelDir + "/decoder.onnx";
    const std::string joinerPath = modelDir + "/joiner.onnx";
    const std::string tokensPath = modelDir + "/tokens.txt";

    // Paraformer and the CTC models share this single-file layout.
    const std::string singleModelPath =
        pickVariant(modelDir + "/model.onnx", modelDir + "/model.int8.onnx");

    const std::string whisperEncoderPath =
        pickVariant(encoderPath, modelDir + "/encoder.int8.onnx");
    const std::string whisperDecoderPath =
        pickVariant(decoderPath, modelDir + "/decoder.int8.onnx");

    const std::string funasrEncoderAdaptorPath =
        pickVariant(modelDir + "/encoder_adaptor.onnx", modelDir + "/encoder_adaptor.int8.onnx");
    const std::string funasrLLMPath =
        pickVariant(modelDir + "/llm.onnx", modelDir + "/llm.int8.onnx");
    const std::string funasrEmbeddingPath =
        pickVariant(modelDir + "/embedding.onnx", modelDir + "/embedding.int8.onnx");
    const std::string funasrTokenizerDir = ResolveTokenizerDir(modelDir);

    const bool hasSingleModel = !singleModelPath.empty();

    // A transducer needs the three full-precision files.
    const bool hasTransducer = FileExists(encoderPath) &&
                               FileExists(decoderPath) &&
                               FileExists(joinerPath);

    // Encoder + decoder without a joiner is taken to be Whisper; with a joiner
    // the directory is a transducer instead.
    const bool hasWhisper = !whisperEncoderPath.empty() &&
                            !whisperDecoderPath.empty() &&
                            !FileExists(joinerPath);

    const bool hasFunAsrTokenizer = !funasrTokenizerDir.empty() &&
                                    FileExists(funasrTokenizerDir + "/vocab.json");
    const bool hasFunAsrNano = !funasrEncoderAdaptorPath.empty() &&
                               !funasrLLMPath.empty() &&
                               !funasrEmbeddingPath.empty() &&
                               hasFunAsrTokenizer;

    // Directory-name hints, matched case-insensitively like DetectTtsModel does.
    const std::string modelDirLower = ToLower(modelDir);
    const auto dirMentions = [&modelDirLower](const char* needle) {
        return modelDirLower.find(needle) != std::string::npos;
    };
    const bool isLikelyNemoCtc = dirMentions("nemo") || dirMentions("parakeet");
    const bool isLikelyWenetCtc = dirMentions("wenet");
    const bool isLikelySenseVoice = dirMentions("sense");  // also covers "sensevoice"
    const bool isLikelyFunAsrNano = dirMentions("funasr");  // also covers "funasr-nano"

    // CTC flavour suggested by the directory name; kUnknown when there is no
    // model file or no hint. Priority: nemo, then wenet, then sense voice.
    SttModelKind hintedCtcKind = SttModelKind::kUnknown;
    if (hasSingleModel) {
        if (isLikelyNemoCtc) {
            hintedCtcKind = SttModelKind::kNemoCtc;
        } else if (isLikelyWenetCtc) {
            hintedCtcKind = SttModelKind::kWenetCtc;
        } else if (isLikelySenseVoice) {
            hintedCtcKind = SttModelKind::kSenseVoice;
        }
    }

    const auto isCtcKind = [](SttModelKind kind) {
        return kind == SttModelKind::kNemoCtc || kind == SttModelKind::kWenetCtc ||
               kind == SttModelKind::kSenseVoice || kind == SttModelKind::kZipformerCtc;
    };

    // Report every candidate found, independent of which one gets selected.
    if (hasTransducer) {
        result.detectedModels.push_back({"transducer", modelDir});
    }
    if (hintedCtcKind == SttModelKind::kNemoCtc) {
        result.detectedModels.push_back({"nemo_ctc", modelDir});
    } else if (hintedCtcKind == SttModelKind::kWenetCtc) {
        result.detectedModels.push_back({"wenet_ctc", modelDir});
    } else if (hintedCtcKind == SttModelKind::kSenseVoice) {
        result.detectedModels.push_back({"sense_voice", modelDir});
    } else if (hasSingleModel) {
        result.detectedModels.push_back({"paraformer", modelDir});
    }
    if (hasWhisper) {
        result.detectedModels.push_back({"whisper", modelDir});
    }
    if (hasFunAsrNano) {
        result.detectedModels.push_back({"funasr_nano", modelDir});
    }

    SttModelKind selected = SttModelKind::kUnknown;

    if (modelType.has_value() && modelType.value() != "auto") {
        // Explicit request: validate the name, then check its files are present.
        selected = ParseSttModelType(modelType.value());
        if (selected == SttModelKind::kUnknown) {
            result.error = "Unknown model type: " + modelType.value();
            return result;
        }

        if (selected == SttModelKind::kTransducer && !hasTransducer) {
            result.error = "Transducer model requested but files not found in " + modelDir;
            return result;
        }
        if (selected == SttModelKind::kParaformer && !hasSingleModel) {
            result.error = "Paraformer model requested but model.onnx not found in " + modelDir;
            return result;
        }
        if (isCtcKind(selected) && !hasSingleModel) {
            result.error = "CTC model requested but model.onnx not found in " + modelDir;
            return result;
        }
        if (selected == SttModelKind::kWhisper && !hasWhisper) {
            result.error = "Whisper model requested but encoder/decoder not found in " + modelDir;
            return result;
        }
        if (selected == SttModelKind::kFunAsrNano && !hasFunAsrNano) {
            result.error = "FunASR Nano model requested but required files not found in " + modelDir;
            return result;
        }
    } else {
        // Auto-detection, most specific layout first.
        if (hasTransducer) {
            selected = SttModelKind::kTransducer;
        } else if (hintedCtcKind != SttModelKind::kUnknown) {
            selected = hintedCtcKind;
        } else if (hasFunAsrNano && isLikelyFunAsrNano) {
            selected = SttModelKind::kFunAsrNano;
        } else if (hasSingleModel) {
            selected = SttModelKind::kParaformer;
        } else if (hasWhisper) {
            selected = SttModelKind::kWhisper;
        } else if (hasFunAsrNano) {
            selected = SttModelKind::kFunAsrNano;
        }
    }

    if (selected == SttModelKind::kUnknown) {
        result.error = "No compatible model type detected in " + modelDir;
        return result;
    }

    result.selectedKind = selected;
    // Whisper and FunASR Nano are the two kinds that can run without tokens.txt.
    result.tokensRequired = !(selected == SttModelKind::kWhisper || selected == SttModelKind::kFunAsrNano);

    if (selected == SttModelKind::kTransducer) {
        result.paths.encoder = encoderPath;
        result.paths.decoder = decoderPath;
        result.paths.joiner = joinerPath;
    } else if (selected == SttModelKind::kParaformer) {
        result.paths.paraformerModel = singleModelPath;
    } else if (isCtcKind(selected)) {
        result.paths.ctcModel = singleModelPath;
    } else if (selected == SttModelKind::kWhisper) {
        result.paths.whisperEncoder = whisperEncoderPath;
        result.paths.whisperDecoder = whisperDecoderPath;
    } else if (selected == SttModelKind::kFunAsrNano) {
        result.paths.funasrEncoderAdaptor = funasrEncoderAdaptorPath;
        result.paths.funasrLLM = funasrLLMPath;
        result.paths.funasrEmbedding = funasrEmbeddingPath;
        result.paths.funasrTokenizer = funasrTokenizerDir + "/vocab.json";
    }

    // tokens.txt is always reported when present, but only mandatory for the
    // kinds flagged above.
    if (FileExists(tokensPath)) {
        result.paths.tokens = tokensPath;
    } else if (result.tokensRequired) {
        result.error = "Tokens file not found at " + tokensPath;
        return result;
    }

    result.ok = true;
    return result;
}
|
|
374
|
-
|
|
375
|
-
// Inspects `modelDir` and decides which text-to-speech model it contains.
//
// Parameters:
//   modelDir  - directory holding the model files; must exist.
//   modelType - explicit model type name, or "auto" to auto-detect.
//
// Returns a TtsDetectResult. On success `ok` is true and `selectedKind` plus
// `paths` are filled in. On failure `ok` keeps its default and `error`
// describes the problem. `detectedModels` lists the candidate types found in
// the directory, whichever one is selected.
//
// Every model kind requires an espeak-ng-data directory and a tokens.txt
// inside `modelDir`.
TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& modelType) {
    TtsDetectResult result;

    // Records the message and hands back the (failed) result in one step.
    const auto fail = [&result](const std::string& message) -> TtsDetectResult {
        result.error = message;
        return result;
    };

    if (modelDir.empty()) {
        return fail("TTS: Model directory is empty");
    }
    if (!FileExists(modelDir) || !IsDirectory(modelDir)) {
        return fail("TTS: Model directory does not exist or is not a directory: " + modelDir);
    }

    // Candidate file locations inside the model directory.
    const std::string onnxPath = modelDir + "/model.onnx";
    const std::string fp16Path = modelDir + "/model.fp16.onnx";
    const std::string int8Path = modelDir + "/model.int8.onnx";
    const std::string tokensPath = modelDir + "/tokens.txt";
    const std::string lexiconPath = modelDir + "/lexicon.txt";
    const std::string espeakDir = modelDir + "/espeak-ng-data";
    const std::string voicesPath = modelDir + "/voices.bin";
    const std::string acousticPath = modelDir + "/acoustic_model.onnx";
    const std::string vocoderPath = modelDir + "/vocoder.onnx";
    const std::string encoderPath = modelDir + "/encoder.onnx";
    const std::string decoderPath = modelDir + "/decoder.onnx";

    // What is actually present on disk.
    const bool hasVits = FileExists(onnxPath) || FileExists(fp16Path) || FileExists(int8Path);
    const bool hasMatcha = FileExists(acousticPath) && FileExists(vocoderPath);
    const bool hasVoicesFile = FileExists(voicesPath);
    const bool hasZipvoice = FileExists(encoderPath) && FileExists(decoderPath) && FileExists(vocoderPath);
    const bool hasDataDir = IsDirectory(espeakDir);

    // Directory-name hints, matched case-insensitively.
    const std::string loweredDir = ToLower(modelDir);
    const bool isLikelyKitten = loweredDir.find("kitten") != std::string::npos;
    const bool isLikelyKokoro = loweredDir.find("kokoro") != std::string::npos;
    const bool kittenOnly = isLikelyKitten && !isLikelyKokoro;
    const bool kokoroOnly = isLikelyKokoro && !isLikelyKitten;

    // --- Candidate list -----------------------------------------------------
    if (hasMatcha) {
        result.detectedModels.push_back({"matcha", modelDir});
    } else if (hasZipvoice) {
        result.detectedModels.push_back({"zipvoice", modelDir});
    }

    if (hasVoicesFile) {
        // voices.bin is shared by kokoro and kitten. A clear hint in the
        // directory name narrows it to one; otherwise both are offered so the
        // caller can choose.
        if (kittenOnly) {
            result.detectedModels.push_back({"kitten", modelDir});
        } else if (kokoroOnly) {
            result.detectedModels.push_back({"kokoro", modelDir});
        } else {
            result.detectedModels.push_back({"kokoro", modelDir});
            result.detectedModels.push_back({"kitten", modelDir});
        }
    }

    if (hasVits) {
        // Without voices.bin a model file is a plain VITS candidate. With
        // voices.bin, VITS is offered only when the directory name mentions
        // "vits" or gives no kokoro/kitten hint at all.
        const bool isLikelyVits = loweredDir.find("vits") != std::string::npos;
        const bool noVoiceHint = !isLikelyKitten && !isLikelyKokoro;
        if (!hasVoicesFile || isLikelyVits || noVoiceHint) {
            result.detectedModels.push_back({"vits", modelDir});
        }
    }

    // --- Selection ----------------------------------------------------------
    TtsModelKind selected = TtsModelKind::kUnknown;
    if (modelType != "auto") {
        selected = ParseTtsModelType(modelType);
        if (selected == TtsModelKind::kUnknown) {
            return fail("TTS: Unknown model type: " + modelType);
        }
    } else if (hasMatcha) {
        selected = TtsModelKind::kMatcha;
    } else if (hasZipvoice) {
        selected = TtsModelKind::kZipvoice;
    } else if (hasVoicesFile) {
        // Kokoro is the default whenever the name does not single out kitten.
        selected = kittenOnly ? TtsModelKind::kKitten : TtsModelKind::kKokoro;
    } else if (hasVits) {
        selected = TtsModelKind::kVits;
    }

    if (selected == TtsModelKind::kUnknown) {
        return fail("TTS: No compatible model type detected in " + modelDir);
    }

    // --- Required-file validation for the selected kind -----------------------
    const bool isVoiceBased =
        selected == TtsModelKind::kKokoro || selected == TtsModelKind::kKitten;

    if (selected == TtsModelKind::kVits && !hasVits) {
        return fail("TTS: VITS model requested but model.onnx not found in " + modelDir);
    }
    if (selected == TtsModelKind::kMatcha && !hasMatcha) {
        return fail("TTS: Matcha model requested but required files not found in " + modelDir);
    }
    if (isVoiceBased && (!hasVits || !hasVoicesFile)) {
        return fail("TTS: Kokoro/Kitten model requested but required files not found in " + modelDir);
    }
    if (selected == TtsModelKind::kZipvoice && !hasZipvoice) {
        return fail("TTS: Zipvoice model requested but required files not found in " + modelDir);
    }

    const bool needsEspeakData =
        selected == TtsModelKind::kVits || selected == TtsModelKind::kMatcha ||
        isVoiceBased || selected == TtsModelKind::kZipvoice;
    if (needsEspeakData && !hasDataDir) {
        return fail("TTS: espeak-ng-data not found in " + modelDir +
                    ". Copy espeak-ng-data into the model directory.");
    }

    // --- Result paths ---------------------------------------------------------
    // Model file preference: int8, then fp16, then full precision; stays empty
    // when none of them exists.
    std::string chosenModel;
    if (FileExists(int8Path)) {
        chosenModel = int8Path;
    } else if (FileExists(fp16Path)) {
        chosenModel = fp16Path;
    } else if (FileExists(onnxPath)) {
        chosenModel = onnxPath;
    }

    // The paths are filled in before the tokens check, so they are populated
    // even when that last check fails.
    result.selectedKind = selected;
    result.paths.ttsModel = chosenModel;
    result.paths.tokens = tokensPath;
    if (FileExists(lexiconPath)) {
        result.paths.lexicon = lexiconPath;
    } else {
        result.paths.lexicon = "";
    }
    result.paths.dataDir = espeakDir;
    result.paths.voices = voicesPath;
    result.paths.acousticModel = acousticPath;
    result.paths.vocoder = vocoderPath;
    result.paths.encoder = encoderPath;
    result.paths.decoder = decoderPath;

    if (!FileExists(tokensPath)) {
        return fail("TTS: tokens.txt not found in " + modelDir);
    }

    result.ok = true;
    return result;
}
|
|
540
|
-
|
|
541
|
-
} // namespace sherpaonnx
|