react-native-sherpa-onnx 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -77
- package/SherpaOnnx.podspec +79 -45
- package/android/build.gradle +8 -2
- package/android/prebuilt-download.gradle +70 -16
- package/android/prebuilt-versions.gradle +14 -6
- package/android/src/main/cpp/CMakeLists.txt +2 -0
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +202 -328
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +22 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +2 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +96 -142
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +40 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +774 -316
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +208 -122
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +92 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +14 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +229 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.h +38 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +144 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.h +38 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +1 -1
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +157 -11
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +75 -24
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +52 -1
- package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
- package/ios/SherpaOnnx+STT.mm +2 -0
- package/ios/SherpaOnnx+TTS.mm +17 -0
- package/ios/SherpaOnnx.mm +27 -3
- package/ios/SherpaOnnxAudioConvert.h +28 -0
- package/ios/SherpaOnnxAudioConvert.mm +698 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +12 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +37 -3
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +80 -45
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +629 -267
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +148 -56
- package/ios/model_detect/sherpa-onnx-model-detect.h +72 -0
- package/ios/model_detect/sherpa-onnx-validate-stt.h +38 -0
- package/ios/model_detect/sherpa-onnx-validate-stt.mm +229 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.h +38 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +144 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +55 -1
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +14 -0
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/index.js +10 -0
- package/lib/module/index.js.map +1 -1
- package/lib/module/stt/streaming.js +6 -3
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/tts/index.js +13 -1
- package/lib/module/tts/index.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +32 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +20 -1
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +10 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +12 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/check-model-csvs.sh +72 -0
- package/scripts/setup-ios-framework.sh +272 -191
- package/src/NativeSherpaOnnx.ts +37 -3
- package/src/audio/index.ts +84 -1
- package/src/download/ModelDownloadManager.ts +19 -0
- package/src/index.tsx +15 -0
- package/src/stt/streaming.ts +10 -5
- package/src/stt/streamingTypes.ts +1 -1
- package/src/tts/index.ts +25 -1
- package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/ios/scripts/patch-libarchive-includes.sh +0 -61
- package/ios/scripts/setup-ios-libarchive.sh +0 -98
|
@@ -1,23 +1,83 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* sherpa-onnx-model-detect-stt.cpp
|
|
3
3
|
*
|
|
4
|
-
* Purpose: Detects STT model type and fills SttModelPaths from a model directory.
|
|
5
|
-
* transducer, paraformer, whisper,
|
|
4
|
+
* Purpose: Detects STT model type and fills SttModelPaths from a model directory. Used by
|
|
5
|
+
* nativeDetectSttModel (module-jni). Supports transducer, paraformer, whisper, moonshine, etc.
|
|
6
|
+
*
|
|
7
|
+
* --- Detection pipeline (overview) ---
|
|
8
|
+
*
|
|
9
|
+
* 1. Gather files in modelDir (recursive), then:
|
|
10
|
+
* - SttCandidatePaths: map file names to logical paths (encoder, decoder, joiner, moonshine
|
|
11
|
+
* preprocessor/encoder/mergedDecoder, paraformer/ctc model, tokens, etc.).
|
|
12
|
+
* - SttPathHints: from directory name only (isLikelyMoonshine, isLikelyNemo, ...).
|
|
13
|
+
* - SttCapabilities: which model types are *possible* given paths + hints (hasWhisper,
|
|
14
|
+
* hasMoonshineV2, hasTransducer, ...). Multiple can be true at once (e.g. same files
|
|
15
|
+
* can satisfy both Whisper and Moonshine v2).
|
|
16
|
+
*
|
|
17
|
+
* 2. detectedModels (for UI "Select model type"): built from capabilities only. Every kind
|
|
18
|
+
* with has* == true is added. So the list shows all types that could work with the files,
|
|
19
|
+
* not the single chosen type.
|
|
20
|
+
*
|
|
21
|
+
* 3. selectedKind (which type we actually use): from ResolveSttKind():
|
|
22
|
+
* - If modelType is explicit (e.g. "whisper"): use it if capabilities allow.
|
|
23
|
+
* - If modelType == "auto": Priority 1 = folder name (GetKindsFromDirName: tokens like
|
|
24
|
+
* "moonshine", "whisper" in dir name --> candidate kinds). Priority 2 = among those
|
|
25
|
+
* candidates, pick the first that CapabilitySupportsKind(). Fallback = if no name
|
|
26
|
+
* candidates, use file-only order (transducer --> moonshine v2/v1 --> CTC --> paraformer -->
|
|
27
|
+
* whisper --> ...).
|
|
28
|
+
*
|
|
29
|
+
* 4. paths: ApplyPathsForSttKind(selectedKind) copies the relevant candidate paths into
|
|
30
|
+
* SttModelPaths (encoder/decoder, moonshine encoder/mergedDecoder, etc.) for the chosen kind.
|
|
31
|
+
*
|
|
32
|
+
* Result to caller: ok, error, detectedModels (list), selectedKind (single), paths (for selectedKind).
|
|
6
33
|
*/
|
|
7
34
|
#include "sherpa-onnx-model-detect.h"
|
|
8
35
|
#include "sherpa-onnx-model-detect-helper.h"
|
|
9
|
-
#include
|
|
36
|
+
#include "sherpa-onnx-validate-stt.h"
|
|
10
37
|
#include <cstdlib>
|
|
11
38
|
#include <string>
|
|
12
39
|
#include <algorithm>
|
|
13
|
-
|
|
40
|
+
#ifdef __ANDROID__
|
|
41
|
+
#include <android/log.h>
|
|
14
42
|
#define LOG_TAG "SttModelDetect"
|
|
15
43
|
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
|
|
16
44
|
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
|
|
45
|
+
#else
|
|
46
|
+
#define LOGI(...) ((void)0)
|
|
47
|
+
#define LOGE(...) ((void)0)
|
|
48
|
+
#endif
|
|
17
49
|
|
|
18
50
|
namespace sherpaonnx {
|
|
19
51
|
namespace {
|
|
20
52
|
|
|
53
|
+
/**
 * Returns the lowercase model-type string for \p k.
 * Kinds that have no entry in the table below map to "unknown".
 */
static const char* KindToName(SttModelKind k) {
    struct KindName {
        SttModelKind kind;
        const char* name;
    };
    static const KindName kKindNames[] = {
        {SttModelKind::kTransducer, "transducer"},
        {SttModelKind::kNemoTransducer, "nemo_transducer"},
        {SttModelKind::kParaformer, "paraformer"},
        {SttModelKind::kNemoCtc, "nemo_ctc"},
        {SttModelKind::kWenetCtc, "wenet_ctc"},
        {SttModelKind::kSenseVoice, "sense_voice"},
        {SttModelKind::kZipformerCtc, "zipformer_ctc"},
        {SttModelKind::kWhisper, "whisper"},
        {SttModelKind::kFunAsrNano, "funasr_nano"},
        {SttModelKind::kFireRedAsr, "fire_red_asr"},
        {SttModelKind::kMoonshine, "moonshine"},
        {SttModelKind::kMoonshineV2, "moonshine_v2"},
        {SttModelKind::kDolphin, "dolphin"},
        {SttModelKind::kCanary, "canary"},
        {SttModelKind::kOmnilingual, "omnilingual"},
        {SttModelKind::kMedAsr, "medasr"},
        {SttModelKind::kTeleSpeechCtc, "telespeech_ctc"},
        {SttModelKind::kToneCtc, "tone_ctc"},
    };
    for (const KindName& entry : kKindNames) {
        if (entry.kind == k) {
            return entry.name;
        }
    }
    return "unknown";
}
|
|
76
|
+
|
|
77
|
+
/**
 * Returns a printable C string for \p s: the literal "(empty)" when \p s is empty,
 * otherwise \p s.c_str(). The returned pointer borrows from \p s in the non-empty case,
 * so it must not outlive the string.
 */
static const char* EmptyOrPath(const std::string& s) {
    if (s.empty()) {
        return "(empty)";
    }
    return s.c_str();
}
|
|
80
|
+
|
|
21
81
|
SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
22
82
|
if (modelType == "transducer") return SttModelKind::kTransducer;
|
|
23
83
|
if (modelType == "nemo_transducer") return SttModelKind::kNemoTransducer;
|
|
@@ -30,6 +90,7 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
|
30
90
|
if (modelType == "funasr_nano") return SttModelKind::kFunAsrNano;
|
|
31
91
|
if (modelType == "fire_red_asr") return SttModelKind::kFireRedAsr;
|
|
32
92
|
if (modelType == "moonshine") return SttModelKind::kMoonshine;
|
|
93
|
+
if (modelType == "moonshine_v2") return SttModelKind::kMoonshineV2;
|
|
33
94
|
if (modelType == "dolphin") return SttModelKind::kDolphin;
|
|
34
95
|
if (modelType == "canary") return SttModelKind::kCanary;
|
|
35
96
|
if (modelType == "omnilingual") return SttModelKind::kOmnilingual;
|
|
@@ -39,6 +100,561 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
|
39
100
|
return SttModelKind::kUnknown;
|
|
40
101
|
}
|
|
41
102
|
|
|
103
|
+
/** Returns true if \p cap and hints/paths support the given \p kind (required files present). */
|
|
104
|
+
static bool CapabilitySupportsKind(
|
|
105
|
+
SttModelKind kind,
|
|
106
|
+
const SttCapabilities& cap,
|
|
107
|
+
const SttPathHints& hints,
|
|
108
|
+
const SttCandidatePaths& paths
|
|
109
|
+
) {
|
|
110
|
+
switch (kind) {
|
|
111
|
+
case SttModelKind::kTransducer:
|
|
112
|
+
return cap.hasTransducer && !(hints.isLikelyNemo || hints.isLikelyTdt);
|
|
113
|
+
case SttModelKind::kNemoTransducer:
|
|
114
|
+
return cap.hasTransducer;
|
|
115
|
+
case SttModelKind::kParaformer:
|
|
116
|
+
return cap.hasParaformer;
|
|
117
|
+
case SttModelKind::kNemoCtc:
|
|
118
|
+
return !paths.ctcModel.empty() && hints.isLikelyNemo;
|
|
119
|
+
case SttModelKind::kWenetCtc:
|
|
120
|
+
return !paths.ctcModel.empty() && hints.isLikelyWenetCtc;
|
|
121
|
+
case SttModelKind::kSenseVoice:
|
|
122
|
+
return !paths.ctcModel.empty() && hints.isLikelySenseVoice;
|
|
123
|
+
case SttModelKind::kZipformerCtc:
|
|
124
|
+
return !paths.ctcModel.empty() && hints.isLikelyZipformer;
|
|
125
|
+
case SttModelKind::kWhisper:
|
|
126
|
+
return cap.hasWhisper;
|
|
127
|
+
case SttModelKind::kFunAsrNano:
|
|
128
|
+
return cap.hasFunAsrNano;
|
|
129
|
+
case SttModelKind::kFireRedAsr:
|
|
130
|
+
return cap.hasFireRedAsr;
|
|
131
|
+
case SttModelKind::kMoonshine:
|
|
132
|
+
return cap.hasMoonshine;
|
|
133
|
+
case SttModelKind::kMoonshineV2:
|
|
134
|
+
return cap.hasMoonshineV2;
|
|
135
|
+
case SttModelKind::kDolphin:
|
|
136
|
+
return cap.hasDolphin;
|
|
137
|
+
case SttModelKind::kCanary:
|
|
138
|
+
return cap.hasCanary;
|
|
139
|
+
case SttModelKind::kOmnilingual:
|
|
140
|
+
return cap.hasOmnilingual;
|
|
141
|
+
case SttModelKind::kMedAsr:
|
|
142
|
+
return cap.hasMedAsr;
|
|
143
|
+
case SttModelKind::kTeleSpeechCtc:
|
|
144
|
+
return cap.hasTeleSpeechCtc;
|
|
145
|
+
case SttModelKind::kToneCtc:
|
|
146
|
+
return cap.hasToneCtc;
|
|
147
|
+
default:
|
|
148
|
+
return false;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* Priority 1: Collect candidate STT kinds from the model directory name (last path component).
|
|
154
|
+
* Tokens like "moonshine", "whisper", "paraformer" are matched case-insensitively. Returns
|
|
155
|
+
* candidates in a fixed priority order so that when multiple kinds match the name, file-based
|
|
156
|
+
* disambiguation picks the first supported one.
|
|
157
|
+
*/
|
|
158
|
+
static std::vector<SttModelKind> GetKindsFromDirName(const std::string& modelDir) {
|
|
159
|
+
using namespace model_detect;
|
|
160
|
+
size_t pos = modelDir.find_last_of("/\\");
|
|
161
|
+
std::string base = (pos == std::string::npos) ? modelDir : modelDir.substr(pos + 1);
|
|
162
|
+
std::string lower = ToLower(base);
|
|
163
|
+
|
|
164
|
+
std::vector<SttModelKind> out;
|
|
165
|
+
auto add = [&out](SttModelKind k) {
|
|
166
|
+
if (std::find(out.begin(), out.end(), k) == out.end())
|
|
167
|
+
out.push_back(k);
|
|
168
|
+
};
|
|
169
|
+
|
|
170
|
+
if (lower.find("moonshine") != std::string::npos) {
|
|
171
|
+
add(SttModelKind::kMoonshineV2);
|
|
172
|
+
add(SttModelKind::kMoonshine);
|
|
173
|
+
}
|
|
174
|
+
if (lower.find("whisper") != std::string::npos)
|
|
175
|
+
add(SttModelKind::kWhisper);
|
|
176
|
+
if (lower.find("paraformer") != std::string::npos)
|
|
177
|
+
add(SttModelKind::kParaformer);
|
|
178
|
+
if (lower.find("nemo") != std::string::npos || lower.find("parakeet") != std::string::npos) {
|
|
179
|
+
add(SttModelKind::kNemoTransducer);
|
|
180
|
+
add(SttModelKind::kNemoCtc);
|
|
181
|
+
}
|
|
182
|
+
if (lower.find("tdt") != std::string::npos)
|
|
183
|
+
add(SttModelKind::kNemoTransducer);
|
|
184
|
+
if (lower.find("wenet") != std::string::npos)
|
|
185
|
+
add(SttModelKind::kWenetCtc);
|
|
186
|
+
if (lower.find("sense") != std::string::npos || lower.find("sensevoice") != std::string::npos)
|
|
187
|
+
add(SttModelKind::kSenseVoice);
|
|
188
|
+
if (lower.find("zipformer") != std::string::npos) {
|
|
189
|
+
add(SttModelKind::kTransducer);
|
|
190
|
+
add(SttModelKind::kZipformerCtc);
|
|
191
|
+
}
|
|
192
|
+
if (lower.find("funasr") != std::string::npos)
|
|
193
|
+
add(SttModelKind::kFunAsrNano);
|
|
194
|
+
if (lower.find("canary") != std::string::npos)
|
|
195
|
+
add(SttModelKind::kCanary);
|
|
196
|
+
if (lower.find("fire_red") != std::string::npos || lower.find("fire-red") != std::string::npos)
|
|
197
|
+
add(SttModelKind::kFireRedAsr);
|
|
198
|
+
if (lower.find("dolphin") != std::string::npos)
|
|
199
|
+
add(SttModelKind::kDolphin);
|
|
200
|
+
if (lower.find("omnilingual") != std::string::npos)
|
|
201
|
+
add(SttModelKind::kOmnilingual);
|
|
202
|
+
if (lower.find("medasr") != std::string::npos)
|
|
203
|
+
add(SttModelKind::kMedAsr);
|
|
204
|
+
if (lower.find("telespeech") != std::string::npos)
|
|
205
|
+
add(SttModelKind::kTeleSpeechCtc);
|
|
206
|
+
if (lower.find("t-one") != std::string::npos || lower.find("t_one") != std::string::npos ||
|
|
207
|
+
model_detect::ContainsWord(lower, "tone"))
|
|
208
|
+
add(SttModelKind::kToneCtc);
|
|
209
|
+
if (lower.find("transducer") != std::string::npos) {
|
|
210
|
+
add(SttModelKind::kTransducer);
|
|
211
|
+
add(SttModelKind::kNemoTransducer);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
return out;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/**
 * Maps the files found under a model directory onto the logical slots of SttCandidatePaths
 * (encoder/decoder/joiner, FunASR parts, Moonshine parts, single-file paraformer/CTC model,
 * tokens, BPE vocab).
 *
 * \param files      File entries gathered for the model directory.
 * \param modelDir   Model directory path; used to locate the FunASR tokenizer directory.
 * \param preferInt8 Forwarded unchanged to the ONNX lookup helpers.
 * \return Candidate paths; a slot is left empty when no matching file was found.
 */
static SttCandidatePaths GatherSttCandidatePaths(
    const std::vector<model_detect::FileEntry>& files,
    const std::string& modelDir,
    const std::optional<bool>& preferInt8
) {
    using namespace model_detect;
    SttCandidatePaths p;

    // Encoder / decoder / joiner and the FunASR model parts.
    p.encoder = FindOnnxByAnyToken(files, {"encoder"}, preferInt8);
    p.decoder = FindOnnxByAnyToken(files, {"decoder"}, preferInt8);
    p.joiner = FindOnnxByAnyToken(files, {"joiner"}, preferInt8);
    p.funasrEncoderAdaptor = FindOnnxByAnyToken(files, {"encoder_adaptor", "encoder-adaptor"}, preferInt8);
    p.funasrLLM = FindOnnxByAnyToken(files, {"llm"}, preferInt8);
    p.funasrEmbedding = FindOnnxByAnyToken(files, {"embedding"}, preferInt8);

    // FunASR tokenizer directory = directory containing vocab.json. A vocab.json directly in
    // modelDir wins; otherwise the first vocab.json found below modelDir is used.
    {
        const std::string vocabName = "vocab.json";
        // Derive the suffix (and its length) from the name instead of hard-coding a length:
        // the previous hard-coded 12 did not match the 11 characters of "/vocab.json", so the
        // root-level check could never succeed.
        const std::string rootVocabSuffix = "/" + vocabName;
        std::string firstVocabBelowModelDir;
        for (const auto& entry : files) {
            if (entry.nameLower != vocabName) continue;
            const std::string& path = entry.path;
            const bool isBelowModelDir =
                path.size() >= modelDir.size() &&
                path.compare(0, modelDir.size(), modelDir) == 0 &&
                (modelDir.empty() || path[modelDir.size()] == '/');
            if (!isBelowModelDir) continue;
            const bool isDirectlyInModelDir =
                path.size() == modelDir.size() + rootVocabSuffix.size() &&
                path.compare(modelDir.size(), rootVocabSuffix.size(), rootVocabSuffix) == 0;
            if (isDirectlyInModelDir) {
                p.funasrTokenizerDir = modelDir;
                break;
            }
            if (firstVocabBelowModelDir.empty())
                firstVocabBelowModelDir = path;
        }
        if (p.funasrTokenizerDir.empty() && !firstVocabBelowModelDir.empty()) {
            const size_t lastSlash = firstVocabBelowModelDir.find_last_of("/\\");
            if (lastSlash != std::string::npos)
                p.funasrTokenizerDir = firstVocabBelowModelDir.substr(0, lastSlash);
        }
    }

    // Moonshine v1 (preprocessor/encoder/cached+uncached decoder) and v2 (merged decoder) parts.
    p.moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
    p.moonshineEncoder = FindOnnxByAnyToken(files, {"encode", "encoder_model"}, preferInt8);
    p.moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
    p.moonshineCachedDecoder = FindOnnxByAnyTokenExcluding(
        files, std::vector<std::string>{"cached_decode", "cached"}, std::vector<std::string>{"uncached"}, preferInt8);
    p.moonshineMergedDecoder = FindOnnxByAnyToken(files, {"merged_decode", "merged_decoder", "decoder_model_merged", "merged"}, preferInt8);

    // Single-file model shared by the paraformer and CTC slots. Both slots use the same
    // selection rule, so it is evaluated once and copied.
    static const std::vector<std::string> modelExcludes = {
        "encoder", "decoder", "joiner", "vocoder", "acoustic", "embedding", "llm",
        "encoder_adaptor", "encoder-adaptor", "encoder_model", "decoder_model",
        "merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached"
    };
    const auto pickStandaloneModel = [&]() -> std::string {
        std::string model = FindOnnxByAnyToken(files, {"model"}, preferInt8);
        if (!model.empty()) {
            // "model" also matches encoder_model / decoder_model / merged_decoder files,
            // which are parts of multi-file models rather than a standalone model.
            const std::string lower = ToLower(model);
            if (lower.find("encoder_model") != std::string::npos ||
                lower.find("decoder_model") != std::string::npos ||
                lower.find("merged_decoder") != std::string::npos)
                model.clear();
        }
        if (model.empty())
            model = FindLargestOnnxExcludingTokens(files, modelExcludes);
        // Never report a file that is already claimed as encoder, decoder or joiner.
        if (!model.empty() && (model == p.encoder || model == p.decoder || model == p.joiner))
            model.clear();
        return model;
    };
    p.paraformerModel = pickStandaloneModel();
    p.ctcModel = p.paraformerModel;

    p.tokens = FindFileEndingWith(files, "tokens.txt");
    p.bpeVocab = FindFileByName(files, "bpe.vocab");
    p.encoderForV2 = p.encoder.empty() ? FindOnnxByAnyToken(files, {"encoder", "encoder_model"}, preferInt8) : p.encoder;

    return p;
}
|
|
295
|
+
|
|
296
|
+
/**
 * Builds SttPathHints by searching the lowercased \p modelDir string for known model-family
 * tokens. Each hint is a plain substring match, except "tone", which goes through ContainsWord.
 *
 * NOTE(review): the whole \p modelDir string is searched here, not only its last path
 * component, so tokens in parent directory names also set hints — confirm this is intended.
 *
 * \param modelDir Path of the model directory.
 * \return Hints with one flag per recognised family.
 */
static SttPathHints GetSttPathHints(const std::string& modelDir) {
    using namespace model_detect;

    std::string dir = ToLower(modelDir);
    const auto has = [&dir](const char* token) {
        return dir.find(token) != std::string::npos;
    };

    SttPathHints hints;
    hints.isLikelyNemo = has("nemo") || has("parakeet");
    hints.isLikelyTdt = has("tdt");
    hints.isLikelyWenetCtc = has("wenet");
    hints.isLikelySenseVoice = has("sense") || has("sensevoice");
    hints.isLikelyFunAsrNano = has("funasr") || has("funasr-nano");
    hints.isLikelyZipformer = has("zipformer");
    hints.isLikelyMoonshine = has("moonshine");
    hints.isLikelyDolphin = has("dolphin");
    hints.isLikelyFireRedAsr = has("fire_red") || has("fire-red");
    hints.isLikelyCanary = has("canary");
    hints.isLikelyOmnilingual = has("omnilingual");
    hints.isLikelyMedAsr = has("medasr");
    hints.isLikelyTeleSpeech = has("telespeech");
    // tone_ctc is for T-One models only (e.g. streaming-t-one-russian). WeNetSpeech CTC
    // (yue, wu, etc.) uses wenet_ctc per sherpa-onnx docs.
    hints.isLikelyToneCtc = has("t-one") || has("t_one") || ContainsWord(dir, "tone");
    hints.isLikelyParaformer = has("paraformer");
    // VAD and TDNN directories are flagged so the resolver can reject them with a clear error.
    hints.isLikelyVad = has("vad") || has("silero") || has("ten-vad");
    hints.isLikelyTdnn = has("tdnn");
    return hints;
}
|
|
322
|
+
|
|
323
|
+
/**
|
|
324
|
+
* QNN (asr-models-qnn-binary): Find model assets and set the correct candidate slot using the
|
|
325
|
+
* given path hints.
|
|
326
|
+
* - Single model.bin -> paraformerModel or ctcModel.
|
|
327
|
+
* - Paraformer with encoder.bin + predictor.bin + decoder.bin (no model.bin): set paraformerModel
|
|
328
|
+
* to "encoder.bin path,predictor.bin path,decoder.bin path" (sherpa-onnx OfflineParaformerModelConfig
|
|
329
|
+
* accepts this format for QNN; see offline-paraformer-model-config.cc).
|
|
330
|
+
* Caller must pass hints from GetSttPathHints (no duplicate call).
|
|
331
|
+
*/
|
|
332
|
+
static void ApplyQnnBinaryModel(
|
|
333
|
+
const std::vector<model_detect::FileEntry>& files,
|
|
334
|
+
const std::string& modelDir,
|
|
335
|
+
const SttPathHints& hints,
|
|
336
|
+
SttCandidatePaths& candidate
|
|
337
|
+
) {
|
|
338
|
+
using namespace model_detect;
|
|
339
|
+
std::string modelbin = FindFileByName(files, "model.bin");
|
|
340
|
+
if (modelbin.empty()) {
|
|
341
|
+
for (const auto& entry : files) {
|
|
342
|
+
if (entry.nameLower.size() >= 9 &&
|
|
343
|
+
entry.nameLower.find("model") != std::string::npos &&
|
|
344
|
+
(entry.nameLower.compare(entry.nameLower.size() - 4, 4, ".bin") == 0)) {
|
|
345
|
+
modelbin = entry.path;
|
|
346
|
+
break;
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
if (modelbin.empty()) {
|
|
351
|
+
const std::string prefix = modelDir + "/";
|
|
352
|
+
for (const auto& entry : files) {
|
|
353
|
+
if (entry.path.size() > prefix.size() &&
|
|
354
|
+
entry.path.compare(0, prefix.size(), prefix) == 0 &&
|
|
355
|
+
entry.path.find('/', prefix.size()) == std::string::npos &&
|
|
356
|
+
entry.nameLower.size() >= 4 &&
|
|
357
|
+
entry.nameLower.compare(entry.nameLower.size() - 4, 4, ".bin") == 0) {
|
|
358
|
+
modelbin = entry.path;
|
|
359
|
+
break;
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
if (!modelbin.empty()) {
|
|
364
|
+
if (hints.isLikelyParaformer)
|
|
365
|
+
candidate.paraformerModel = modelbin;
|
|
366
|
+
else if (candidate.ctcModel.empty())
|
|
367
|
+
candidate.ctcModel = modelbin;
|
|
368
|
+
return;
|
|
369
|
+
}
|
|
370
|
+
// Paraformer QNN with encoder.bin + predictor.bin + decoder.bin (sherpa-onnx expects
|
|
371
|
+
// model="encoder.bin,predictor.bin,decoder.bin" for this case).
|
|
372
|
+
if (hints.isLikelyParaformer) {
|
|
373
|
+
std::string enc = FindFileByName(files, "encoder.bin");
|
|
374
|
+
std::string pred = FindFileByName(files, "predictor.bin");
|
|
375
|
+
std::string dec = FindFileByName(files, "decoder.bin");
|
|
376
|
+
if (!enc.empty() && !pred.empty() && !dec.empty()) {
|
|
377
|
+
candidate.paraformerModel = enc + "," + pred + "," + dec;
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
/** Error message when model is for unsupported hardware (RK35xx, Ascend, etc.). */
|
|
383
|
+
static const char* kHardwareSpecificUnsupportedMessage =
|
|
384
|
+
"This model is built for hardware-specific acceleration (e.g. RK35xx, Ascend, CANN) and is not supported by the React Native SDK. Use an ONNX model for CPU/GPU or a QNN-capable model on supported devices.";
|
|
385
|
+
|
|
386
|
+
/** True if model dir name indicates a hardware-specific build (e.g. RK3588, Ascend). Not runnable on generic host. QNN is supported by the SDK. */
|
|
387
|
+
static bool IsHardwareSpecificModelDir(const std::string& modelDir) {
|
|
388
|
+
using namespace model_detect;
|
|
389
|
+
std::string lower = ToLower(modelDir);
|
|
390
|
+
const char* tokens[] = {
|
|
391
|
+
"rk3588", "rk3576", "rk3568", "rk3566", "rk3562", "rknn",
|
|
392
|
+
"ascend", "cann", "910b", "910b2", "310p3"
|
|
393
|
+
};
|
|
394
|
+
for (const char* t : tokens) {
|
|
395
|
+
if (lower.find(t) != std::string::npos)
|
|
396
|
+
return true;
|
|
397
|
+
}
|
|
398
|
+
return false;
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
/**
 * Derives which model kinds are possible from the candidate paths and the name hints.
 * Several flags can be true at once (e.g. an encoder+decoder pair without a joiner satisfies
 * hasWhisper and, with the right hint, hasCanary or hasFireRedAsr as well).
 *
 * \param paths Candidate file paths for the model directory.
 * \param hints Name-derived hints for the model directory.
 * \return Capability flags.
 */
static SttCapabilities ComputeSttCapabilities(const SttCandidatePaths& paths, const SttPathHints& hints) {
    const bool hasEncoder = !paths.encoder.empty();
    const bool hasDecoder = !paths.decoder.empty();
    const bool hasJoiner = !paths.joiner.empty();
    const bool hasCtcModel = !paths.ctcModel.empty();
    const bool hasParaformerModel = !paths.paraformerModel.empty();
    // Encoder + decoder without a joiner: the layout shared by Whisper-style models.
    const bool encoderDecoderOnly = hasEncoder && hasDecoder && !hasJoiner;

    SttCapabilities caps;
    caps.hasTransducer = hasEncoder && hasDecoder && hasJoiner;
    caps.hasWhisper = encoderDecoderOnly;
    caps.hasFunAsrNano = !paths.funasrEncoderAdaptor.empty() && !paths.funasrLLM.empty() &&
                         !paths.funasrEmbedding.empty() && !paths.funasrTokenizerDir.empty();
    caps.hasMoonshine = !paths.moonshinePreprocessor.empty() && !paths.moonshineUncachedDecoder.empty() &&
                        !paths.moonshineCachedDecoder.empty() && !paths.moonshineEncoder.empty();
    caps.hasMoonshineV2 = !paths.moonshineMergedDecoder.empty() && !paths.encoderForV2.empty() && !hasJoiner;
    // Streaming paraformer uses encoder.onnx + decoder.onnx (no joiner, no single "model.onnx").
    caps.hasParaformer = hasParaformerModel || (hints.isLikelyParaformer && encoderDecoderOnly);
    caps.hasDolphin = hints.isLikelyDolphin && hasCtcModel;
    // Fire Red ASR: only encoder+decoder (two files). Single-file Fire Red (e.g. fire-red-asr2-ctc)
    // uses CTC path to avoid native crash.
    caps.hasFireRedAsr = (caps.hasTransducer || encoderDecoderOnly) && hints.isLikelyFireRedAsr;
    caps.hasFireRedCtc = hints.isLikelyFireRedAsr && !hasEncoder && !hasDecoder &&
                         (hasCtcModel || hasParaformerModel);
    caps.hasCanary = encoderDecoderOnly && hints.isLikelyCanary;
    caps.hasOmnilingual = hasCtcModel && hints.isLikelyOmnilingual;
    caps.hasMedAsr = hasCtcModel && hints.isLikelyMedAsr;
    caps.hasTeleSpeechCtc = (hasCtcModel || hasParaformerModel) && hints.isLikelyTeleSpeech;
    caps.hasToneCtc = hasCtcModel && hints.isLikelyToneCtc;
    return caps;
}
|
|
429
|
+
|
|
430
|
+
static void CollectDetectedModels(
|
|
431
|
+
std::vector<DetectedModel>& out,
|
|
432
|
+
const SttCapabilities& cap,
|
|
433
|
+
const SttPathHints& hints,
|
|
434
|
+
const SttCandidatePaths& paths,
|
|
435
|
+
const std::string& modelDir
|
|
436
|
+
) {
|
|
437
|
+
if (cap.hasTransducer) {
|
|
438
|
+
out.push_back({(hints.isLikelyNemo || hints.isLikelyTdt) ? "nemo_transducer" : "transducer", modelDir});
|
|
439
|
+
}
|
|
440
|
+
if (!paths.ctcModel.empty() && (hints.isLikelyNemo || hints.isLikelyWenetCtc || hints.isLikelySenseVoice || hints.isLikelyZipformer)) {
|
|
441
|
+
if (hints.isLikelyNemo) out.push_back({"nemo_ctc", modelDir});
|
|
442
|
+
else if (hints.isLikelyWenetCtc) out.push_back({"wenet_ctc", modelDir});
|
|
443
|
+
else if (hints.isLikelySenseVoice) out.push_back({"sense_voice", modelDir});
|
|
444
|
+
else out.push_back({"zipformer_ctc", modelDir});
|
|
445
|
+
} else if (!paths.paraformerModel.empty()) {
|
|
446
|
+
out.push_back({"paraformer", modelDir});
|
|
447
|
+
}
|
|
448
|
+
if (cap.hasWhisper) out.push_back({"whisper", modelDir});
|
|
449
|
+
if (cap.hasFunAsrNano) out.push_back({"funasr_nano", modelDir});
|
|
450
|
+
if (cap.hasMoonshine) out.push_back({"moonshine", modelDir});
|
|
451
|
+
if (cap.hasMoonshineV2) out.push_back({"moonshine_v2", modelDir});
|
|
452
|
+
if (cap.hasDolphin) out.push_back({"dolphin", modelDir});
|
|
453
|
+
if (cap.hasFireRedAsr) out.push_back({"fire_red_asr", modelDir});
|
|
454
|
+
if (cap.hasCanary) out.push_back({"canary", modelDir});
|
|
455
|
+
if (cap.hasOmnilingual) out.push_back({"omnilingual", modelDir});
|
|
456
|
+
if (cap.hasMedAsr) out.push_back({"medasr", modelDir});
|
|
457
|
+
if (cap.hasTeleSpeechCtc) out.push_back({"telespeech_ctc", modelDir});
|
|
458
|
+
if (cap.hasToneCtc) out.push_back({"tone_ctc", modelDir});
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
/**
 * Decide which STT model kind to load for a model directory.
 *
 * Resolution order:
 *   1. Directories whose path hints mark them as VAD or TDNN are rejected.
 *   2. If `modelType` holds a value other than "auto", it is parsed and checked
 *      against the detected capabilities / candidate paths; a mismatch is an error.
 *   3. Otherwise (auto): kinds suggested by the directory name are tried first,
 *      then a fixed file-based priority order is applied.
 *
 * @param modelType Optional explicit model type requested by the caller ("auto" or unset means auto-detect).
 * @param cap       Capability flags computed for the directory.
 * @param hints     Path-derived hints (isLikely* flags).
 * @param paths     Candidate model file paths found in the directory.
 * @param modelDir  Model directory; used for name-based detection and in error messages.
 * @param outError  Cleared on entry; receives a message when an explicit rejection occurs.
 * @return The resolved kind, or SttModelKind::kUnknown. In auto mode kUnknown may be
 *         returned with `outError` left empty (nothing matched).
 */
static SttModelKind ResolveSttKind(
    const std::optional<std::string>& modelType,
    const SttCapabilities& cap,
    const SttPathHints& hints,
    const SttCandidatePaths& paths,
    const std::string& modelDir,
    std::string& outError
) {
    outError.clear();

    // Records the failure reason and yields the "unresolved" kind in one step.
    const auto fail = [&outError](const std::string& message) {
        outError = message;
        return SttModelKind::kUnknown;
    };

    if (hints.isLikelyVad) {
        return fail("VAD models are not yet supported by the React Native SDK.");
    }
    if (hints.isLikelyTdnn) {
        return fail("TDNN (keyword/yesno) models are not yet supported by the React Native SDK.");
    }

    // Explicit request: validate that the files needed for the requested kind were found.
    if (modelType && *modelType != "auto") {
        const std::string& requested = *modelType;
        const SttModelKind selected = ParseSttModelType(requested);

        switch (selected) {
            case SttModelKind::kUnknown:
                return fail("Unknown model type: " + requested);

            case SttModelKind::kTransducer:
                if (!cap.hasTransducer) {
                    return fail("Transducer model requested but files not found in " + modelDir);
                }
                break;

            case SttModelKind::kNemoTransducer:
                if (!cap.hasTransducer) {
                    return fail("NeMo Transducer model requested but encoder/decoder/joiner not found in " + modelDir);
                }
                break;

            case SttModelKind::kParaformer:
                if (!cap.hasParaformer) {
                    return fail("Paraformer model requested but model file (or encoder+decoder for streaming) not found in " + modelDir);
                }
                break;

            case SttModelKind::kNemoCtc:
            case SttModelKind::kWenetCtc:
            case SttModelKind::kSenseVoice:
            case SttModelKind::kZipformerCtc:
                if (paths.ctcModel.empty()) {
                    return fail("CTC model requested but model file not found in " + modelDir);
                }
                break;

            case SttModelKind::kToneCtc:
                // Tone CTC must pass the generic CTC file check first, then its own capability check.
                if (paths.ctcModel.empty()) {
                    return fail("CTC model requested but model file not found in " + modelDir);
                }
                if (!cap.hasToneCtc) {
                    return fail("Tone CTC model requested but path does not contain 'tone' (as a word), 't-one', or 't_one' (e.g. sherpa-onnx-streaming-t-one-*) in " + modelDir);
                }
                break;

            case SttModelKind::kWhisper:
                if (!cap.hasWhisper) {
                    return fail("Whisper model requested but encoder/decoder not found in " + modelDir);
                }
                break;

            case SttModelKind::kFunAsrNano:
                if (!cap.hasFunAsrNano) {
                    return fail("FunASR Nano model requested but required files not found in " + modelDir);
                }
                break;

            case SttModelKind::kMoonshine:
                if (!cap.hasMoonshine) {
                    return fail("Moonshine v1 model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir);
                }
                break;

            case SttModelKind::kMoonshineV2:
                if (!cap.hasMoonshineV2) {
                    return fail("Moonshine v2 model requested but encoder/merged_decode not found in " + modelDir);
                }
                break;

            case SttModelKind::kDolphin:
                if (!cap.hasDolphin) {
                    return fail("Dolphin model requested but model not found in " + modelDir);
                }
                break;

            case SttModelKind::kFireRedAsr:
                if (!cap.hasFireRedAsr) {
                    return fail("FireRed ASR model requested but encoder/decoder not found in " + modelDir);
                }
                break;

            case SttModelKind::kCanary:
                if (!cap.hasCanary) {
                    return fail("Canary model requested but encoder/decoder not found in " + modelDir);
                }
                break;

            case SttModelKind::kOmnilingual:
                if (!cap.hasOmnilingual) {
                    return fail("Omnilingual model requested but model not found in " + modelDir);
                }
                break;

            case SttModelKind::kMedAsr:
                if (!cap.hasMedAsr) {
                    return fail("MedASR model requested but model not found in " + modelDir);
                }
                break;

            case SttModelKind::kTeleSpeechCtc:
                if (!cap.hasTeleSpeechCtc) {
                    return fail("TeleSpeech CTC model requested but model not found in " + modelDir);
                }
                break;

            default:
                break;
        }
        return selected;
    }

    // Auto, priority 1: take the first directory-name candidate whose required files exist.
    // If none qualifies (or the name gives no candidates) fall through to file-only detection.
    const std::vector<SttModelKind> kindsFromName = GetKindsFromDirName(modelDir);
    for (const SttModelKind kind : kindsFromName) {
        if (CapabilitySupportsKind(kind, cap, hints, paths)) {
            return kind;
        }
    }

    // Auto, priority 2: file-only detection in a fixed order.
    if (cap.hasTransducer) {
        const bool nemoFlavoured = hints.isLikelyNemo || hints.isLikelyTdt;
        return nemoFlavoured ? SttModelKind::kNemoTransducer : SttModelKind::kTransducer;
    }

    if (hints.isLikelyMoonshine) {
        if (cap.hasMoonshineV2) return SttModelKind::kMoonshineV2;
        if (cap.hasMoonshine) return SttModelKind::kMoonshine;
    }

    // A CTC model file combined with a path hint selects the matching CTC flavour.
    if (!paths.ctcModel.empty()) {
        if (hints.isLikelyToneCtc) return SttModelKind::kToneCtc;
        if (hints.isLikelyNemo) return SttModelKind::kNemoCtc;
        if (hints.isLikelyWenetCtc) return SttModelKind::kWenetCtc;
        if (hints.isLikelySenseVoice) return SttModelKind::kSenseVoice;
    }

    // Remaining checks, evaluated strictly top to bottom; the first match wins.
    struct FallbackRule {
        bool matches;
        SttModelKind kind;
    };
    const FallbackRule fallbackRules[] = {
        {cap.hasFunAsrNano && hints.isLikelyFunAsrNano, SttModelKind::kFunAsrNano},
        {cap.hasFireRedCtc, SttModelKind::kZipformerCtc},
        {!paths.paraformerModel.empty(), SttModelKind::kParaformer},
        {cap.hasCanary, SttModelKind::kCanary},
        {cap.hasFireRedAsr, SttModelKind::kFireRedAsr},
        {cap.hasWhisper, SttModelKind::kWhisper},
        {cap.hasFunAsrNano, SttModelKind::kFunAsrNano},
        {cap.hasMoonshineV2, SttModelKind::kMoonshineV2},
        {cap.hasDolphin, SttModelKind::kDolphin},
        {cap.hasOmnilingual, SttModelKind::kOmnilingual},
        {cap.hasMedAsr, SttModelKind::kMedAsr},
        {cap.hasTeleSpeechCtc, SttModelKind::kTeleSpeechCtc},
        {cap.hasToneCtc, SttModelKind::kToneCtc},
        {!paths.ctcModel.empty(), SttModelKind::kZipformerCtc},
    };
    for (const FallbackRule& rule : fallbackRules) {
        if (rule.matches) {
            return rule.kind;
        }
    }

    return SttModelKind::kUnknown;
}
|
|
587
|
+
|
|
588
|
+
/**
 * Copy the candidate paths that are relevant for `kind` into `resultPaths`.
 *
 * Only the fields belonging to the selected kind are written; all other fields of
 * `resultPaths` are left untouched. Kinds without a case here write nothing.
 *
 * @param kind        The model kind whose paths should be filled in.
 * @param candidate   Candidate paths gathered for the model directory.
 * @param resultPaths Destination structure receiving the kind-specific paths.
 */
static void ApplyPathsForSttKind(SttModelKind kind, const SttCandidatePaths& candidate, SttModelPaths& resultPaths) {
    const SttCandidatePaths& src = candidate;
    SttModelPaths& dst = resultPaths;

    // Yields `preferred` unless it is empty, in which case `fallback` is used.
    const auto firstNonEmpty = [](const std::string& preferred, const std::string& fallback) -> const std::string& {
        return preferred.empty() ? fallback : preferred;
    };

    switch (kind) {
        case SttModelKind::kTransducer:
        case SttModelKind::kNemoTransducer: {
            dst.encoder = src.encoder;
            dst.decoder = src.decoder;
            dst.joiner = src.joiner;
            return;
        }

        case SttModelKind::kParaformer: {
            dst.paraformerModel = src.paraformerModel;
            // Streaming paraformer ships encoder.onnx + decoder.onnx instead of a single model.onnx.
            const bool hasEncoderDecoderPair = !src.encoder.empty() && !src.decoder.empty();
            if (dst.paraformerModel.empty() && hasEncoderDecoderPair) {
                dst.encoder = src.encoder;
                dst.decoder = src.decoder;
            }
            return;
        }

        case SttModelKind::kNemoCtc:
        case SttModelKind::kWenetCtc:
        case SttModelKind::kSenseVoice:
        case SttModelKind::kZipformerCtc:
        case SttModelKind::kToneCtc: {
            dst.ctcModel = src.ctcModel;
            return;
        }

        case SttModelKind::kWhisper: {
            dst.whisperEncoder = src.encoder;
            dst.whisperDecoder = src.decoder;
            return;
        }

        case SttModelKind::kFunAsrNano: {
            dst.funasrEncoderAdaptor = src.funasrEncoderAdaptor;
            dst.funasrLLM = src.funasrLLM;
            dst.funasrEmbedding = src.funasrEmbedding;
            dst.funasrTokenizer = src.funasrTokenizerDir;
            return;
        }

        case SttModelKind::kMoonshine: {
            dst.moonshinePreprocessor = src.moonshinePreprocessor;
            dst.moonshineEncoder = src.moonshineEncoder;
            dst.moonshineUncachedDecoder = src.moonshineUncachedDecoder;
            dst.moonshineCachedDecoder = src.moonshineCachedDecoder;
            return;
        }

        case SttModelKind::kMoonshineV2: {
            dst.moonshineEncoder = src.encoderForV2;
            dst.moonshineMergedDecoder = src.moonshineMergedDecoder;
            return;
        }

        case SttModelKind::kDolphin: {
            // Prefer the CTC candidate; otherwise use the paraformer candidate.
            dst.dolphinModel = firstNonEmpty(src.ctcModel, src.paraformerModel);
            return;
        }

        case SttModelKind::kFireRedAsr: {
            // When a dedicated encoder/decoder file is missing, a single model file
            // (paraformer candidate first, then CTC candidate) stands in for it.
            const std::string standIn = firstNonEmpty(src.paraformerModel, src.ctcModel);
            dst.fireRedEncoder = firstNonEmpty(src.encoder, standIn);
            dst.fireRedDecoder = firstNonEmpty(src.decoder, standIn);
            return;
        }

        case SttModelKind::kCanary: {
            dst.canaryEncoder = src.encoder;
            dst.canaryDecoder = src.decoder;
            return;
        }

        case SttModelKind::kOmnilingual: {
            dst.omnilingualModel = src.ctcModel;
            return;
        }

        case SttModelKind::kMedAsr: {
            dst.medasrModel = src.ctcModel;
            return;
        }

        case SttModelKind::kTeleSpeechCtc: {
            // Prefer the CTC candidate; otherwise use the paraformer candidate.
            dst.telespeechCtcModel = firstNonEmpty(src.ctcModel, src.paraformerModel);
            return;
        }

        default:
            return;
    }
}
|
|
657
|
+
|
|
42
658
|
} // namespace
|
|
43
659
|
|
|
44
660
|
SttDetectResult DetectSttModel(
|
|
@@ -71,344 +687,186 @@ SttDetectResult DetectSttModel(
|
|
|
71
687
|
// Depth 4 supports layouts like root/data/lang_bpe_500/tokens.txt (icefall, k2)
|
|
72
688
|
const int kMaxSearchDepth = 4;
|
|
73
689
|
const auto files = ListFilesRecursive(modelDir, kMaxSearchDepth);
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
if (verbose) {
|
|
690
|
+
if (debug) {
|
|
691
|
+
LOGI("DetectSttModel: Found %zu files in %s", files.size(), modelDir.c_str());
|
|
77
692
|
for (const auto& f : files) {
|
|
78
693
|
LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
|
|
79
694
|
}
|
|
80
|
-
}
|
|
81
|
-
LOGI("(detailed file listing suppressed; enable by passing debug=true to initialize())");
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
std::string encoderPath = FindOnnxByAnyToken(files, {"encoder"}, preferInt8);
|
|
85
|
-
std::string decoderPath = FindOnnxByAnyToken(files, {"decoder"}, preferInt8);
|
|
86
|
-
std::string joinerPath = FindOnnxByAnyToken(files, {"joiner"}, preferInt8);
|
|
87
|
-
|
|
88
|
-
LOGI("DetectSttModel: encoder=%s, decoder=%s, joiner=%s",
|
|
89
|
-
encoderPath.c_str(), decoderPath.c_str(), joinerPath.c_str());
|
|
90
|
-
|
|
91
|
-
std::string funasrEncoderAdaptor = FindOnnxByAnyToken(files, {"encoder_adaptor", "encoder-adaptor"}, preferInt8);
|
|
92
|
-
std::string funasrLLM = FindOnnxByAnyToken(files, {"llm"}, preferInt8);
|
|
93
|
-
std::string funasrEmbedding = FindOnnxByAnyToken(files, {"embedding"}, preferInt8);
|
|
94
|
-
|
|
95
|
-
std::string funasrTokenizerDir = ResolveTokenizerDir(modelDir);
|
|
96
|
-
|
|
97
|
-
// Moonshine: preprocess, encode, uncached_decode, cached_decode
|
|
98
|
-
std::string moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
|
|
99
|
-
std::string moonshineEncoder = FindOnnxByAnyToken(files, {"encode"}, preferInt8);
|
|
100
|
-
std::string moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
|
|
101
|
-
std::string moonshineCachedDecoder = FindOnnxByAnyToken(files, {"cached_decode", "cached"}, preferInt8);
|
|
102
|
-
|
|
103
|
-
std::vector<std::string> modelExcludes = {
|
|
104
|
-
"encoder",
|
|
105
|
-
"decoder",
|
|
106
|
-
"joiner",
|
|
107
|
-
"vocoder",
|
|
108
|
-
"acoustic",
|
|
109
|
-
"embedding",
|
|
110
|
-
"llm",
|
|
111
|
-
"encoder_adaptor",
|
|
112
|
-
"encoder-adaptor"
|
|
113
|
-
};
|
|
695
|
+
}
|
|
114
696
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
bool hasWhisper = hasWhisperEncoder && hasWhisperDecoder && joinerPath.empty();
|
|
142
|
-
|
|
143
|
-
bool hasFunAsrEncoderAdaptor = !funasrEncoderAdaptor.empty();
|
|
144
|
-
bool hasFunAsrLLM = !funasrLLM.empty();
|
|
145
|
-
bool hasFunAsrEmbedding = !funasrEmbedding.empty();
|
|
146
|
-
bool hasFunAsrTokenizer = !funasrTokenizerDir.empty() && FileExists(funasrTokenizerDir + "/vocab.json");
|
|
147
|
-
bool hasFunAsrNano = hasFunAsrEncoderAdaptor && hasFunAsrLLM && hasFunAsrEmbedding && hasFunAsrTokenizer;
|
|
148
|
-
|
|
149
|
-
// Case-insensitive path hints so "Nemo parakeet Tdt CTC 110m EN" etc. are recognized
|
|
150
|
-
std::string modelDirLower = model_detect::ToLower(modelDir);
|
|
151
|
-
bool isLikelyNemo = modelDirLower.find("nemo") != std::string::npos ||
|
|
152
|
-
modelDirLower.find("parakeet") != std::string::npos;
|
|
153
|
-
bool isLikelyTdt = modelDirLower.find("tdt") != std::string::npos;
|
|
154
|
-
bool isLikelyWenetCtc = modelDirLower.find("wenet") != std::string::npos;
|
|
155
|
-
bool isLikelySenseVoice = modelDirLower.find("sense") != std::string::npos ||
|
|
156
|
-
modelDirLower.find("sensevoice") != std::string::npos;
|
|
157
|
-
bool isLikelyFunAsrNano = modelDirLower.find("funasr") != std::string::npos ||
|
|
158
|
-
modelDirLower.find("funasr-nano") != std::string::npos;
|
|
159
|
-
bool isLikelyMoonshine = modelDirLower.find("moonshine") != std::string::npos;
|
|
160
|
-
bool isLikelyDolphin = modelDirLower.find("dolphin") != std::string::npos;
|
|
161
|
-
bool isLikelyFireRedAsr = modelDirLower.find("fire_red") != std::string::npos ||
|
|
162
|
-
modelDirLower.find("fire-red") != std::string::npos;
|
|
163
|
-
bool isLikelyCanary = modelDirLower.find("canary") != std::string::npos;
|
|
164
|
-
bool isLikelyOmnilingual = modelDirLower.find("omnilingual") != std::string::npos;
|
|
165
|
-
bool isLikelyMedAsr = modelDirLower.find("medasr") != std::string::npos;
|
|
166
|
-
bool isLikelyTeleSpeech = modelDirLower.find("telespeech") != std::string::npos;
|
|
167
|
-
// Tone CTC: match "tone" only as standalone word (not e.g. "cantonese"); also accept "t-one" / "t_one"
|
|
168
|
-
bool isLikelyToneCtc = modelDirLower.find("t-one") != std::string::npos ||
|
|
169
|
-
modelDirLower.find("t_one") != std::string::npos ||
|
|
170
|
-
model_detect::ContainsWord(modelDirLower, "tone");
|
|
171
|
-
|
|
172
|
-
bool hasMoonshine = !moonshinePreprocessor.empty() && !moonshineUncachedDecoder.empty() &&
|
|
173
|
-
!moonshineCachedDecoder.empty() && !moonshineEncoder.empty();
|
|
174
|
-
bool hasDolphin = isLikelyDolphin && !ctcModelPath.empty();
|
|
175
|
-
bool hasFireRedAsr = hasTransducer && isLikelyFireRedAsr;
|
|
176
|
-
// Canary (NeMo Canary) uses encoder + decoder without joiner; same file pattern as Whisper but path contains "canary"
|
|
177
|
-
bool hasCanary = hasWhisperEncoder && hasWhisperDecoder && joinerPath.empty() && isLikelyCanary;
|
|
178
|
-
bool hasOmnilingual = !ctcModelPath.empty() && isLikelyOmnilingual;
|
|
179
|
-
bool hasMedAsr = !ctcModelPath.empty() && isLikelyMedAsr;
|
|
180
|
-
bool hasTeleSpeechCtc = (!ctcModelPath.empty() || !paraformerModelPath.empty()) && isLikelyTeleSpeech;
|
|
181
|
-
bool hasToneCtc = !ctcModelPath.empty() && isLikelyToneCtc;
|
|
182
|
-
|
|
183
|
-
if (hasTransducer) {
|
|
184
|
-
if (isLikelyNemo || isLikelyTdt) {
|
|
185
|
-
result.detectedModels.push_back({"nemo_transducer", modelDir});
|
|
186
|
-
} else {
|
|
187
|
-
result.detectedModels.push_back({"transducer", modelDir});
|
|
188
|
-
}
|
|
697
|
+
SttCandidatePaths candidate = GatherSttCandidatePaths(files, modelDir, preferInt8);
|
|
698
|
+
SttPathHints hints = GetSttPathHints(modelDir);
|
|
699
|
+
ApplyQnnBinaryModel(files, modelDir, hints, candidate);
|
|
700
|
+
SttCapabilities cap = ComputeSttCapabilities(candidate, hints);
|
|
701
|
+
if (debug) {
|
|
702
|
+
LOGI("DetectSttModel: tokens=%s", EmptyOrPath(candidate.tokens));
|
|
703
|
+
LOGI("DetectSttModel: transducer encoder=%s decoder=%s joiner=%s",
|
|
704
|
+
EmptyOrPath(candidate.encoder), EmptyOrPath(candidate.decoder), EmptyOrPath(candidate.joiner));
|
|
705
|
+
LOGI("DetectSttModel: paraformerModel=%s ctcModel=%s tokens=%s bpeVocab=%s",
|
|
706
|
+
EmptyOrPath(candidate.paraformerModel), EmptyOrPath(candidate.ctcModel), EmptyOrPath(candidate.tokens), EmptyOrPath(candidate.bpeVocab));
|
|
707
|
+
LOGI("DetectSttModel: moonshine preprocessor=%s encoder=%s uncachedDecoder=%s cachedDecoder=%s mergedDecoder=%s",
|
|
708
|
+
EmptyOrPath(candidate.moonshinePreprocessor), EmptyOrPath(candidate.moonshineEncoder), EmptyOrPath(candidate.moonshineUncachedDecoder),
|
|
709
|
+
EmptyOrPath(candidate.moonshineCachedDecoder), EmptyOrPath(candidate.moonshineMergedDecoder));
|
|
710
|
+
LOGI("DetectSttModel: whisper encoder=%s decoder=%s (same as transducer; joiner empty => whisper)",
|
|
711
|
+
EmptyOrPath(candidate.encoder), EmptyOrPath(candidate.decoder));
|
|
712
|
+
LOGI("DetectSttModel: funasr encoderAdaptor=%s llm=%s embedding=%s tokenizerDir=%s",
|
|
713
|
+
EmptyOrPath(candidate.funasrEncoderAdaptor), EmptyOrPath(candidate.funasrLLM), EmptyOrPath(candidate.funasrEmbedding), EmptyOrPath(candidate.funasrTokenizerDir));
|
|
714
|
+
LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
|
|
715
|
+
(int)cap.hasTransducer, (int)cap.hasWhisper, (int)cap.hasMoonshine, (int)cap.hasMoonshineV2,
|
|
716
|
+
(int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
|
|
717
|
+
(int)cap.hasCanary, (int)cap.hasOmnilingual, (int)cap.hasMedAsr, (int)cap.hasTeleSpeechCtc, (int)cap.hasToneCtc);
|
|
718
|
+
LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
|
|
719
|
+
(int)hints.isLikelyNemo, (int)hints.isLikelyTdt, (int)hints.isLikelyWenetCtc, (int)hints.isLikelySenseVoice,
|
|
720
|
+
(int)hints.isLikelyFunAsrNano, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
|
|
721
|
+
(int)hints.isLikelyFireRedAsr, (int)hints.isLikelyCanary, (int)hints.isLikelyOmnilingual, (int)hints.isLikelyMedAsr,
|
|
722
|
+
(int)hints.isLikelyTeleSpeech, (int)hints.isLikelyToneCtc, (int)hints.isLikelyParaformer, (int)hints.isLikelyVad, (int)hints.isLikelyTdnn);
|
|
189
723
|
}
|
|
190
724
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
result.
|
|
198
|
-
|
|
199
|
-
|
|
725
|
+
CollectDetectedModels(result.detectedModels, cap, hints, candidate, modelDir);
|
|
726
|
+
|
|
727
|
+
result.selectedKind = ResolveSttKind(modelType, cap, hints, candidate, modelDir, result.error);
|
|
728
|
+
if (result.selectedKind == SttModelKind::kUnknown) {
|
|
729
|
+
if (IsHardwareSpecificModelDir(modelDir)) {
|
|
730
|
+
result.ok = false;
|
|
731
|
+
result.isHardwareSpecificUnsupported = true;
|
|
732
|
+
result.error = kHardwareSpecificUnsupportedMessage;
|
|
733
|
+
LOGE("%s", result.error.c_str());
|
|
734
|
+
return result;
|
|
735
|
+
}
|
|
736
|
+
if (!result.error.empty()) {
|
|
737
|
+
LOGE("%s", result.error.c_str());
|
|
738
|
+
return result;
|
|
739
|
+
}
|
|
740
|
+
result.error = "No compatible model type detected in " + modelDir;
|
|
741
|
+
LOGE("%s", result.error.c_str());
|
|
742
|
+
if (debug) {
|
|
743
|
+
for (const auto& f : files)
|
|
744
|
+
LOGI(" file: %s (size=%llu)", f.path.c_str(), (unsigned long long)f.size);
|
|
200
745
|
}
|
|
201
|
-
|
|
202
|
-
result.detectedModels.push_back({"paraformer", modelDir});
|
|
746
|
+
return result;
|
|
203
747
|
}
|
|
204
748
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
749
|
+
LOGI("DetectSttModel: selected kind=%d (%s)", static_cast<int>(result.selectedKind), KindToName(result.selectedKind));
|
|
750
|
+
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano);
|
|
751
|
+
ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
|
|
208
752
|
|
|
209
|
-
if (
|
|
210
|
-
result.
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
if (hasDolphin) {
|
|
216
|
-
result.detectedModels.push_back({"dolphin", modelDir});
|
|
217
|
-
}
|
|
218
|
-
if (hasFireRedAsr) {
|
|
219
|
-
result.detectedModels.push_back({"fire_red_asr", modelDir});
|
|
220
|
-
}
|
|
221
|
-
if (hasCanary) {
|
|
222
|
-
result.detectedModels.push_back({"canary", modelDir});
|
|
753
|
+
if (!candidate.tokens.empty() && FileExists(candidate.tokens)) {
|
|
754
|
+
result.paths.tokens = candidate.tokens;
|
|
755
|
+
} else if (result.tokensRequired) {
|
|
756
|
+
result.error = "Tokens file not found in " + modelDir;
|
|
757
|
+
LOGE("%s", result.error.c_str());
|
|
758
|
+
return result;
|
|
223
759
|
}
|
|
224
|
-
if (
|
|
225
|
-
result.
|
|
760
|
+
if (!candidate.bpeVocab.empty() && FileExists(candidate.bpeVocab)) {
|
|
761
|
+
result.paths.bpeVocab = candidate.bpeVocab;
|
|
226
762
|
}
|
|
227
|
-
|
|
228
|
-
|
|
763
|
+
|
|
764
|
+
auto validation = ValidateSttPaths(result.selectedKind, result.paths, modelDir);
|
|
765
|
+
if (!validation.ok) {
|
|
766
|
+
result.ok = false;
|
|
767
|
+
result.error = validation.error;
|
|
768
|
+
LOGE("%s", result.error.c_str());
|
|
769
|
+
return result;
|
|
229
770
|
}
|
|
230
|
-
|
|
231
|
-
|
|
771
|
+
|
|
772
|
+
// Log paths actually set for the selected kind (so we can verify nothing is missing).
|
|
773
|
+
switch (result.selectedKind) {
|
|
774
|
+
case SttModelKind::kTransducer:
|
|
775
|
+
case SttModelKind::kNemoTransducer:
|
|
776
|
+
LOGI("DetectSttModel: paths set encoder=%s decoder=%s joiner=%s",
|
|
777
|
+
EmptyOrPath(result.paths.encoder), EmptyOrPath(result.paths.decoder), EmptyOrPath(result.paths.joiner));
|
|
778
|
+
break;
|
|
779
|
+
case SttModelKind::kParaformer:
|
|
780
|
+
LOGI("DetectSttModel: paths set paraformerModel=%s", EmptyOrPath(result.paths.paraformerModel));
|
|
781
|
+
break;
|
|
782
|
+
case SttModelKind::kWhisper:
|
|
783
|
+
LOGI("DetectSttModel: paths set whisperEncoder=%s whisperDecoder=%s",
|
|
784
|
+
EmptyOrPath(result.paths.whisperEncoder), EmptyOrPath(result.paths.whisperDecoder));
|
|
785
|
+
break;
|
|
786
|
+
case SttModelKind::kMoonshine:
|
|
787
|
+
LOGI("DetectSttModel: paths set moonshine preprocessor=%s encoder=%s uncachedDecoder=%s cachedDecoder=%s",
|
|
788
|
+
EmptyOrPath(result.paths.moonshinePreprocessor), EmptyOrPath(result.paths.moonshineEncoder),
|
|
789
|
+
EmptyOrPath(result.paths.moonshineUncachedDecoder), EmptyOrPath(result.paths.moonshineCachedDecoder));
|
|
790
|
+
break;
|
|
791
|
+
case SttModelKind::kMoonshineV2:
|
|
792
|
+
LOGI("DetectSttModel: paths set moonshine_v2 encoder=%s mergedDecoder=%s",
|
|
793
|
+
EmptyOrPath(result.paths.moonshineEncoder), EmptyOrPath(result.paths.moonshineMergedDecoder));
|
|
794
|
+
break;
|
|
795
|
+
case SttModelKind::kNemoCtc:
|
|
796
|
+
case SttModelKind::kWenetCtc:
|
|
797
|
+
case SttModelKind::kSenseVoice:
|
|
798
|
+
case SttModelKind::kZipformerCtc:
|
|
799
|
+
case SttModelKind::kToneCtc:
|
|
800
|
+
LOGI("DetectSttModel: paths set ctcModel=%s", EmptyOrPath(result.paths.ctcModel));
|
|
801
|
+
break;
|
|
802
|
+
case SttModelKind::kFireRedAsr:
|
|
803
|
+
LOGI("DetectSttModel: paths set fireRedEncoder=%s fireRedDecoder=%s",
|
|
804
|
+
EmptyOrPath(result.paths.fireRedEncoder), EmptyOrPath(result.paths.fireRedDecoder));
|
|
805
|
+
break;
|
|
806
|
+
case SttModelKind::kFunAsrNano:
|
|
807
|
+
LOGI("DetectSttModel: paths set funasr adaptor=%s llm=%s embedding=%s tokenizer=%s",
|
|
808
|
+
EmptyOrPath(result.paths.funasrEncoderAdaptor), EmptyOrPath(result.paths.funasrLLM),
|
|
809
|
+
EmptyOrPath(result.paths.funasrEmbedding), EmptyOrPath(result.paths.funasrTokenizer));
|
|
810
|
+
break;
|
|
811
|
+
default:
|
|
812
|
+
break;
|
|
232
813
|
}
|
|
233
|
-
|
|
234
|
-
|
|
814
|
+
LOGI("DetectSttModel: tokens=%s (required=%d)", EmptyOrPath(result.paths.tokens), (int)result.tokensRequired);
|
|
815
|
+
LOGI("DetectSttModel: detection OK for %s", modelDir.c_str());
|
|
816
|
+
result.ok = true;
|
|
817
|
+
return result;
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
// Test-only: used by host-side model_detect_test; not used in production (Android/iOS use DetectSttModel).
|
|
821
|
+
SttDetectResult DetectSttModelFromFileList(
|
|
822
|
+
const std::vector<model_detect::FileEntry>& files,
|
|
823
|
+
const std::string& modelDir,
|
|
824
|
+
const std::optional<bool>& preferInt8,
|
|
825
|
+
const std::optional<std::string>& modelType
|
|
826
|
+
) {
|
|
827
|
+
using namespace model_detect;
|
|
828
|
+
|
|
829
|
+
SttDetectResult result;
|
|
830
|
+
|
|
831
|
+
if (modelDir.empty()) {
|
|
832
|
+
result.error = "Model directory is empty";
|
|
833
|
+
return result;
|
|
235
834
|
}
|
|
236
835
|
|
|
237
|
-
|
|
836
|
+
SttCandidatePaths candidate = GatherSttCandidatePaths(files, modelDir, preferInt8);
|
|
837
|
+
SttPathHints hints = GetSttPathHints(modelDir);
|
|
838
|
+
ApplyQnnBinaryModel(files, modelDir, hints, candidate);
|
|
839
|
+
SttCapabilities cap = ComputeSttCapabilities(candidate, hints);
|
|
238
840
|
|
|
239
|
-
|
|
240
|
-
selected = ParseSttModelType(modelType.value());
|
|
241
|
-
if (selected == SttModelKind::kUnknown) {
|
|
242
|
-
result.error = "Unknown model type: " + modelType.value();
|
|
243
|
-
return result;
|
|
244
|
-
}
|
|
841
|
+
CollectDetectedModels(result.detectedModels, cap, hints, candidate, modelDir);
|
|
245
842
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
result.error =
|
|
843
|
+
result.selectedKind = ResolveSttKind(modelType, cap, hints, candidate, modelDir, result.error);
|
|
844
|
+
if (result.selectedKind == SttModelKind::kUnknown) {
|
|
845
|
+
if (IsHardwareSpecificModelDir(modelDir)) {
|
|
846
|
+
result.ok = false;
|
|
847
|
+
result.isHardwareSpecificUnsupported = true;
|
|
848
|
+
result.error = kHardwareSpecificUnsupportedMessage;
|
|
252
849
|
return result;
|
|
253
850
|
}
|
|
254
|
-
if (
|
|
255
|
-
result.error = "
|
|
256
|
-
|
|
257
|
-
}
|
|
258
|
-
if ((selected == SttModelKind::kNemoCtc || selected == SttModelKind::kWenetCtc ||
|
|
259
|
-
selected == SttModelKind::kSenseVoice || selected == SttModelKind::kZipformerCtc ||
|
|
260
|
-
selected == SttModelKind::kToneCtc) &&
|
|
261
|
-
ctcModelPath.empty()) {
|
|
262
|
-
result.error = "CTC model requested but model file not found in " + modelDir;
|
|
263
|
-
return result;
|
|
264
|
-
}
|
|
265
|
-
if (selected == SttModelKind::kWhisper && !hasWhisper) {
|
|
266
|
-
result.error = "Whisper model requested but encoder/decoder not found in " + modelDir;
|
|
267
|
-
return result;
|
|
268
|
-
}
|
|
269
|
-
if (selected == SttModelKind::kFunAsrNano && !hasFunAsrNano) {
|
|
270
|
-
result.error = "FunASR Nano model requested but required files not found in " + modelDir;
|
|
271
|
-
return result;
|
|
272
|
-
}
|
|
273
|
-
if (selected == SttModelKind::kMoonshine && !hasMoonshine) {
|
|
274
|
-
result.error = "Moonshine model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir;
|
|
275
|
-
return result;
|
|
276
|
-
}
|
|
277
|
-
if (selected == SttModelKind::kDolphin && !hasDolphin) {
|
|
278
|
-
result.error = "Dolphin model requested but model not found in " + modelDir;
|
|
279
|
-
return result;
|
|
280
|
-
}
|
|
281
|
-
if (selected == SttModelKind::kFireRedAsr && !hasFireRedAsr) {
|
|
282
|
-
result.error = "FireRed ASR model requested but encoder/decoder not found in " + modelDir;
|
|
283
|
-
return result;
|
|
284
|
-
}
|
|
285
|
-
if (selected == SttModelKind::kCanary && !hasCanary) {
|
|
286
|
-
result.error = "Canary model requested but encoder/decoder not found in " + modelDir;
|
|
287
|
-
return result;
|
|
288
|
-
}
|
|
289
|
-
if (selected == SttModelKind::kOmnilingual && !hasOmnilingual) {
|
|
290
|
-
result.error = "Omnilingual model requested but model not found in " + modelDir;
|
|
291
|
-
return result;
|
|
292
|
-
}
|
|
293
|
-
if (selected == SttModelKind::kMedAsr && !hasMedAsr) {
|
|
294
|
-
result.error = "MedASR model requested but model not found in " + modelDir;
|
|
295
|
-
return result;
|
|
296
|
-
}
|
|
297
|
-
if (selected == SttModelKind::kTeleSpeechCtc && !hasTeleSpeechCtc) {
|
|
298
|
-
result.error = "TeleSpeech CTC model requested but model not found in " + modelDir;
|
|
299
|
-
return result;
|
|
300
|
-
}
|
|
301
|
-
if (selected == SttModelKind::kToneCtc && !hasToneCtc) {
|
|
302
|
-
result.error = "Tone CTC model requested but path does not contain 'tone' (as a word), 't-one', or 't_one' (e.g. sherpa-onnx-streaming-t-one-*) in " + modelDir;
|
|
303
|
-
return result;
|
|
304
|
-
}
|
|
305
|
-
} else {
|
|
306
|
-
if (hasTransducer) {
|
|
307
|
-
selected = (isLikelyNemo || isLikelyTdt) ? SttModelKind::kNemoTransducer : SttModelKind::kTransducer;
|
|
308
|
-
} else if (!ctcModelPath.empty() && (isLikelyNemo || isLikelyWenetCtc || isLikelySenseVoice)) {
|
|
309
|
-
if (isLikelyNemo) {
|
|
310
|
-
selected = SttModelKind::kNemoCtc;
|
|
311
|
-
} else if (isLikelyWenetCtc) {
|
|
312
|
-
selected = SttModelKind::kWenetCtc;
|
|
313
|
-
} else {
|
|
314
|
-
selected = SttModelKind::kSenseVoice;
|
|
315
|
-
}
|
|
316
|
-
} else if (hasFunAsrNano && isLikelyFunAsrNano) {
|
|
317
|
-
selected = SttModelKind::kFunAsrNano;
|
|
318
|
-
} else if (!paraformerModelPath.empty()) {
|
|
319
|
-
selected = SttModelKind::kParaformer;
|
|
320
|
-
} else if (hasCanary) {
|
|
321
|
-
selected = SttModelKind::kCanary;
|
|
322
|
-
} else if (hasFireRedAsr) {
|
|
323
|
-
selected = SttModelKind::kFireRedAsr;
|
|
324
|
-
} else if (hasWhisper) {
|
|
325
|
-
selected = SttModelKind::kWhisper;
|
|
326
|
-
} else if (hasFunAsrNano) {
|
|
327
|
-
selected = SttModelKind::kFunAsrNano;
|
|
328
|
-
} else if (hasMoonshine && isLikelyMoonshine) {
|
|
329
|
-
selected = SttModelKind::kMoonshine;
|
|
330
|
-
} else if (hasDolphin) {
|
|
331
|
-
selected = SttModelKind::kDolphin;
|
|
332
|
-
} else if (hasOmnilingual) {
|
|
333
|
-
selected = SttModelKind::kOmnilingual;
|
|
334
|
-
} else if (hasMedAsr) {
|
|
335
|
-
selected = SttModelKind::kMedAsr;
|
|
336
|
-
} else if (hasTeleSpeechCtc) {
|
|
337
|
-
selected = SttModelKind::kTeleSpeechCtc;
|
|
338
|
-
} else if (hasToneCtc) {
|
|
339
|
-
selected = SttModelKind::kToneCtc;
|
|
340
|
-
} else if (!ctcModelPath.empty()) {
|
|
341
|
-
selected = SttModelKind::kZipformerCtc;
|
|
342
|
-
}
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
if (selected == SttModelKind::kUnknown) {
|
|
346
|
-
result.error = "No compatible model type detected in " + modelDir;
|
|
347
|
-
LOGE("%s", result.error.c_str());
|
|
851
|
+
if (result.error.empty())
|
|
852
|
+
result.error = "No compatible model type detected in " + modelDir;
|
|
853
|
+
result.ok = false;
|
|
348
854
|
return result;
|
|
349
855
|
}
|
|
350
856
|
|
|
351
|
-
|
|
352
|
-
result.selectedKind
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
result.tokensRequired = (selected != SttModelKind::kFunAsrNano);
|
|
357
|
-
|
|
358
|
-
if (selected == SttModelKind::kTransducer || selected == SttModelKind::kNemoTransducer) {
|
|
359
|
-
result.paths.encoder = encoderPath;
|
|
360
|
-
result.paths.decoder = decoderPath;
|
|
361
|
-
result.paths.joiner = joinerPath;
|
|
362
|
-
} else if (selected == SttModelKind::kParaformer) {
|
|
363
|
-
result.paths.paraformerModel = paraformerModelPath;
|
|
364
|
-
} else if (selected == SttModelKind::kNemoCtc || selected == SttModelKind::kWenetCtc ||
|
|
365
|
-
selected == SttModelKind::kSenseVoice || selected == SttModelKind::kZipformerCtc ||
|
|
366
|
-
selected == SttModelKind::kToneCtc) {
|
|
367
|
-
result.paths.ctcModel = ctcModelPath;
|
|
368
|
-
} else if (selected == SttModelKind::kWhisper) {
|
|
369
|
-
result.paths.whisperEncoder = encoderPath;
|
|
370
|
-
result.paths.whisperDecoder = decoderPath;
|
|
371
|
-
} else if (selected == SttModelKind::kFunAsrNano) {
|
|
372
|
-
result.paths.funasrEncoderAdaptor = funasrEncoderAdaptor;
|
|
373
|
-
result.paths.funasrLLM = funasrLLM;
|
|
374
|
-
result.paths.funasrEmbedding = funasrEmbedding;
|
|
375
|
-
// FunASR Nano C++ expects tokenizer directory (e.g. .../Qwen3-0.6B), not path to vocab.json
|
|
376
|
-
result.paths.funasrTokenizer = funasrTokenizerDir;
|
|
377
|
-
} else if (selected == SttModelKind::kMoonshine) {
|
|
378
|
-
result.paths.moonshinePreprocessor = moonshinePreprocessor;
|
|
379
|
-
result.paths.moonshineEncoder = moonshineEncoder;
|
|
380
|
-
result.paths.moonshineUncachedDecoder = moonshineUncachedDecoder;
|
|
381
|
-
result.paths.moonshineCachedDecoder = moonshineCachedDecoder;
|
|
382
|
-
} else if (selected == SttModelKind::kDolphin) {
|
|
383
|
-
result.paths.dolphinModel = ctcModelPath.empty() ? paraformerModelPath : ctcModelPath;
|
|
384
|
-
} else if (selected == SttModelKind::kFireRedAsr) {
|
|
385
|
-
result.paths.fireRedEncoder = encoderPath;
|
|
386
|
-
result.paths.fireRedDecoder = decoderPath;
|
|
387
|
-
} else if (selected == SttModelKind::kCanary) {
|
|
388
|
-
result.paths.canaryEncoder = encoderPath;
|
|
389
|
-
result.paths.canaryDecoder = decoderPath;
|
|
390
|
-
} else if (selected == SttModelKind::kOmnilingual) {
|
|
391
|
-
result.paths.omnilingualModel = ctcModelPath;
|
|
392
|
-
} else if (selected == SttModelKind::kMedAsr) {
|
|
393
|
-
result.paths.medasrModel = ctcModelPath;
|
|
394
|
-
} else if (selected == SttModelKind::kTeleSpeechCtc) {
|
|
395
|
-
result.paths.telespeechCtcModel = ctcModelPath.empty() ? paraformerModelPath : ctcModelPath;
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
if (!tokensPath.empty() && FileExists(tokensPath)) {
|
|
399
|
-
result.paths.tokens = tokensPath;
|
|
400
|
-
} else if (result.tokensRequired) {
|
|
401
|
-
result.error = "Tokens file not found in " + modelDir;
|
|
402
|
-
LOGE("%s", result.error.c_str());
|
|
403
|
-
return result;
|
|
404
|
-
}
|
|
857
|
+
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano);
|
|
858
|
+
ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
|
|
859
|
+
|
|
860
|
+
result.paths.tokens = candidate.tokens;
|
|
861
|
+
result.paths.bpeVocab = candidate.bpeVocab;
|
|
405
862
|
|
|
406
|
-
|
|
407
|
-
|
|
863
|
+
auto validation = ValidateSttPaths(result.selectedKind, result.paths, modelDir);
|
|
864
|
+
if (!validation.ok) {
|
|
865
|
+
result.ok = false;
|
|
866
|
+
result.error = validation.error;
|
|
867
|
+
return result;
|
|
408
868
|
}
|
|
409
869
|
|
|
410
|
-
LOGI("DetectSttModel: detection OK for %s — tokens=%s",
|
|
411
|
-
modelDir.c_str(), result.paths.tokens.c_str());
|
|
412
870
|
result.ok = true;
|
|
413
871
|
return result;
|
|
414
872
|
}
|