react-native-sherpa-onnx 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -77
- package/SherpaOnnx.podspec +79 -45
- package/android/build.gradle +8 -2
- package/android/prebuilt-download.gradle +70 -16
- package/android/prebuilt-versions.gradle +14 -6
- package/android/src/main/cpp/CMakeLists.txt +2 -0
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +202 -328
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +22 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +2 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +96 -142
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +40 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +774 -316
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +208 -122
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +92 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +14 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +229 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.h +38 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +144 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.h +38 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +1 -1
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +157 -11
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +75 -24
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +52 -1
- package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
- package/ios/SherpaOnnx+STT.mm +2 -0
- package/ios/SherpaOnnx+TTS.mm +17 -0
- package/ios/SherpaOnnx.mm +27 -3
- package/ios/SherpaOnnxAudioConvert.h +28 -0
- package/ios/SherpaOnnxAudioConvert.mm +698 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +12 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +37 -3
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +80 -45
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +629 -267
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +148 -56
- package/ios/model_detect/sherpa-onnx-model-detect.h +72 -0
- package/ios/model_detect/sherpa-onnx-validate-stt.h +38 -0
- package/ios/model_detect/sherpa-onnx-validate-stt.mm +229 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.h +38 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +144 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +55 -1
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +14 -0
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/index.js +10 -0
- package/lib/module/index.js.map +1 -1
- package/lib/module/stt/streaming.js +6 -3
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/tts/index.js +13 -1
- package/lib/module/tts/index.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +32 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +20 -1
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +10 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +12 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/check-model-csvs.sh +72 -0
- package/scripts/setup-ios-framework.sh +272 -191
- package/src/NativeSherpaOnnx.ts +37 -3
- package/src/audio/index.ts +84 -1
- package/src/download/ModelDownloadManager.ts +19 -0
- package/src/index.tsx +15 -0
- package/src/stt/streaming.ts +10 -5
- package/src/stt/streamingTypes.ts +1 -1
- package/src/tts/index.ts +25 -1
- package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/ios/scripts/patch-libarchive-includes.sh +0 -61
- package/ios/scripts/setup-ios-libarchive.sh +0 -98
|
@@ -2,19 +2,79 @@
|
|
|
2
2
|
* sherpa-onnx-model-detect-stt.mm
|
|
3
3
|
*
|
|
4
4
|
* Purpose: Detects STT (speech-to-text) model type and fills SttModelPaths from a model directory.
|
|
5
|
-
*
|
|
5
|
+
* Used by the STT wrapper on iOS. Supports transducer, paraformer, whisper, moonshine, etc.
|
|
6
|
+
*
|
|
7
|
+
* --- Detection pipeline (overview) ---
|
|
8
|
+
*
|
|
9
|
+
* 1. Gather files in modelDir (recursive), then:
|
|
10
|
+
* - SttCandidatePaths: map file names to logical paths (encoder, decoder, joiner, moonshine
|
|
11
|
+
* preprocessor/encoder/mergedDecoder, paraformer/ctc model, tokens, etc.).
|
|
12
|
+
* - SttPathHints: from directory name only (isLikelyMoonshine, isLikelyNemo, ...).
|
|
13
|
+
* - SttCapabilities: which model types are *possible* given paths + hints (hasWhisper,
|
|
14
|
+
* hasMoonshineV2, hasTransducer, ...). Multiple can be true at once (e.g. same files
|
|
15
|
+
* can satisfy both Whisper and Moonshine v2).
|
|
16
|
+
*
|
|
17
|
+
* 2. detectedModels (for UI "Select model type"): built from capabilities only. Every kind
|
|
18
|
+
* with has* == true is added. So the list shows all types that could work with the files,
|
|
19
|
+
* not the single chosen type.
|
|
20
|
+
*
|
|
21
|
+
* 3. selectedKind (which type we actually use): from ResolveSttKind():
|
|
22
|
+
* - If modelType is explicit (e.g. "whisper"): use it if capabilities allow.
|
|
23
|
+
* - If modelType == "auto": Priority 1 = folder name (GetKindsFromDirName: tokens like
|
|
24
|
+
* "moonshine", "whisper" in dir name --> candidate kinds). Priority 2 = among those
|
|
25
|
+
* candidates, pick the first that CapabilitySupportsKind(). Fallback = if no name
|
|
26
|
+
* candidates, use file-only order (transducer --> moonshine v2/v1 --> CTC --> paraformer -->
|
|
27
|
+
* whisper --> ...).
|
|
28
|
+
*
|
|
29
|
+
* 4. paths: ApplyPathsForSttKind(selectedKind) copies the relevant candidate paths into
|
|
30
|
+
* SttModelPaths (encoder/decoder, moonshine encoder/mergedDecoder, etc.) for the chosen kind.
|
|
31
|
+
*
|
|
32
|
+
* Result to caller: ok, error, detectedModels (list), selectedKind (single), paths (for selectedKind).
|
|
6
33
|
*/
|
|
7
34
|
|
|
35
|
+
#import <Foundation/Foundation.h>
|
|
8
36
|
#include "sherpa-onnx-model-detect.h"
|
|
9
37
|
#include "sherpa-onnx-model-detect-helper.h"
|
|
38
|
+
#include "sherpa-onnx-validate-stt.h"
|
|
10
39
|
|
|
40
|
+
#include <algorithm>
|
|
11
41
|
#include <string>
|
|
12
42
|
|
|
43
|
+
#define LOGI(fmt, ...) NSLog(@"[SttModelDetect] " fmt, ##__VA_ARGS__)
|
|
44
|
+
|
|
13
45
|
namespace sherpaonnx {
|
|
14
46
|
namespace {
|
|
15
47
|
|
|
16
48
|
using namespace model_detect;
|
|
17
49
|
|
|
50
|
+
/**
 * Maps an SttModelKind to its lowercase string identifier
 * (e.g. kMoonshineV2 -> "moonshine_v2").
 *
 * @param k Kind to name.
 * @return Pointer to a string literal; "unknown" for any kind without an explicit case.
 */
static const char* KindToName(SttModelKind k) {
    const char* name = "unknown";
    switch (k) {
        case SttModelKind::kTransducer:     name = "transducer";      break;
        case SttModelKind::kNemoTransducer: name = "nemo_transducer"; break;
        case SttModelKind::kParaformer:     name = "paraformer";      break;
        case SttModelKind::kNemoCtc:        name = "nemo_ctc";        break;
        case SttModelKind::kWenetCtc:       name = "wenet_ctc";       break;
        case SttModelKind::kSenseVoice:     name = "sense_voice";     break;
        case SttModelKind::kZipformerCtc:   name = "zipformer_ctc";   break;
        case SttModelKind::kWhisper:        name = "whisper";         break;
        case SttModelKind::kFunAsrNano:     name = "funasr_nano";     break;
        case SttModelKind::kFireRedAsr:     name = "fire_red_asr";    break;
        case SttModelKind::kMoonshine:      name = "moonshine";       break;
        case SttModelKind::kMoonshineV2:    name = "moonshine_v2";    break;
        case SttModelKind::kDolphin:        name = "dolphin";         break;
        case SttModelKind::kCanary:         name = "canary";          break;
        case SttModelKind::kOmnilingual:    name = "omnilingual";     break;
        case SttModelKind::kMedAsr:         name = "medasr";          break;
        case SttModelKind::kTeleSpeechCtc:  name = "telespeech_ctc";  break;
        case SttModelKind::kToneCtc:        name = "tone_ctc";        break;
        default:                            break;  // keeps the "unknown" fallback
    }
    return name;
}
|
|
73
|
+
|
|
74
|
+
/**
 * Returns a printable C string for a path: the literal "(empty)" when \p s is empty,
 * otherwise s.c_str(). The returned pointer is only valid while \p s is alive and unmodified.
 */
static const char* EmptyOrPath(const std::string& s) {
    if (s.empty()) {
        return "(empty)";
    }
    return s.c_str();
}
|
|
77
|
+
|
|
18
78
|
SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
19
79
|
if (modelType == "transducer" || modelType == "zipformer") return SttModelKind::kTransducer;
|
|
20
80
|
if (modelType == "nemo_transducer") return SttModelKind::kNemoTransducer;
|
|
@@ -27,6 +87,7 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
|
27
87
|
if (modelType == "funasr_nano") return SttModelKind::kFunAsrNano;
|
|
28
88
|
if (modelType == "fire_red_asr") return SttModelKind::kFireRedAsr;
|
|
29
89
|
if (modelType == "moonshine") return SttModelKind::kMoonshine;
|
|
90
|
+
if (modelType == "moonshine_v2") return SttModelKind::kMoonshineV2;
|
|
30
91
|
if (modelType == "dolphin") return SttModelKind::kDolphin;
|
|
31
92
|
if (modelType == "canary") return SttModelKind::kCanary;
|
|
32
93
|
if (modelType == "omnilingual") return SttModelKind::kOmnilingual;
|
|
@@ -36,324 +97,625 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
|
36
97
|
return SttModelKind::kUnknown;
|
|
37
98
|
}
|
|
38
99
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
const
|
|
43
|
-
const
|
|
44
|
-
const
|
|
45
|
-
bool debug /* = false */
|
|
100
|
+
/** Returns true if \p cap and hints/paths support the given \p kind (required files present). */
|
|
101
|
+
static bool CapabilitySupportsKind(
|
|
102
|
+
SttModelKind kind,
|
|
103
|
+
const SttCapabilities& cap,
|
|
104
|
+
const SttPathHints& hints,
|
|
105
|
+
const SttCandidatePaths& paths
|
|
46
106
|
) {
|
|
47
|
-
|
|
107
|
+
switch (kind) {
|
|
108
|
+
case SttModelKind::kTransducer:
|
|
109
|
+
return cap.hasTransducer && !(hints.isLikelyNemo || hints.isLikelyTdt);
|
|
110
|
+
case SttModelKind::kNemoTransducer:
|
|
111
|
+
return cap.hasTransducer;
|
|
112
|
+
case SttModelKind::kParaformer:
|
|
113
|
+
return cap.hasParaformer;
|
|
114
|
+
case SttModelKind::kNemoCtc:
|
|
115
|
+
return !paths.ctcModel.empty() && hints.isLikelyNemo;
|
|
116
|
+
case SttModelKind::kWenetCtc:
|
|
117
|
+
return !paths.ctcModel.empty() && hints.isLikelyWenetCtc;
|
|
118
|
+
case SttModelKind::kSenseVoice:
|
|
119
|
+
return !paths.ctcModel.empty() && hints.isLikelySenseVoice;
|
|
120
|
+
case SttModelKind::kZipformerCtc:
|
|
121
|
+
return !paths.ctcModel.empty() && hints.isLikelyZipformer;
|
|
122
|
+
case SttModelKind::kWhisper:
|
|
123
|
+
return cap.hasWhisper;
|
|
124
|
+
case SttModelKind::kFunAsrNano:
|
|
125
|
+
return cap.hasFunAsrNano;
|
|
126
|
+
case SttModelKind::kFireRedAsr:
|
|
127
|
+
return cap.hasFireRedAsr;
|
|
128
|
+
case SttModelKind::kMoonshine:
|
|
129
|
+
return cap.hasMoonshine;
|
|
130
|
+
case SttModelKind::kMoonshineV2:
|
|
131
|
+
return cap.hasMoonshineV2;
|
|
132
|
+
case SttModelKind::kDolphin:
|
|
133
|
+
return cap.hasDolphin;
|
|
134
|
+
case SttModelKind::kCanary:
|
|
135
|
+
return cap.hasCanary;
|
|
136
|
+
case SttModelKind::kOmnilingual:
|
|
137
|
+
return cap.hasOmnilingual;
|
|
138
|
+
case SttModelKind::kMedAsr:
|
|
139
|
+
return cap.hasMedAsr;
|
|
140
|
+
case SttModelKind::kTeleSpeechCtc:
|
|
141
|
+
return cap.hasTeleSpeechCtc;
|
|
142
|
+
case SttModelKind::kToneCtc:
|
|
143
|
+
return cap.hasToneCtc;
|
|
144
|
+
default:
|
|
145
|
+
return false;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
48
148
|
|
|
49
|
-
|
|
149
|
+
/**
|
|
150
|
+
* Priority 1: Collect candidate STT kinds from the model directory name (last path component).
|
|
151
|
+
* Tokens like "moonshine", "whisper", "paraformer" are matched case-insensitively. Returns
|
|
152
|
+
* candidates in a fixed priority order so that when multiple kinds match the name, file-based
|
|
153
|
+
* disambiguation picks the first supported one.
|
|
154
|
+
*/
|
|
155
|
+
static std::vector<SttModelKind> GetKindsFromDirName(const std::string& modelDir) {
|
|
156
|
+
size_t pos = modelDir.find_last_of("/\\");
|
|
157
|
+
std::string base = (pos == std::string::npos) ? modelDir : modelDir.substr(pos + 1);
|
|
158
|
+
std::string lower = ToLower(base);
|
|
159
|
+
|
|
160
|
+
std::vector<SttModelKind> out;
|
|
161
|
+
auto add = [&out](SttModelKind k) {
|
|
162
|
+
if (std::find(out.begin(), out.end(), k) == out.end())
|
|
163
|
+
out.push_back(k);
|
|
164
|
+
};
|
|
50
165
|
|
|
51
|
-
if (
|
|
52
|
-
|
|
53
|
-
|
|
166
|
+
if (lower.find("moonshine") != std::string::npos) {
|
|
167
|
+
add(SttModelKind::kMoonshineV2);
|
|
168
|
+
add(SttModelKind::kMoonshine);
|
|
54
169
|
}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
170
|
+
if (lower.find("whisper") != std::string::npos)
|
|
171
|
+
add(SttModelKind::kWhisper);
|
|
172
|
+
if (lower.find("paraformer") != std::string::npos)
|
|
173
|
+
add(SttModelKind::kParaformer);
|
|
174
|
+
if (lower.find("nemo") != std::string::npos || lower.find("parakeet") != std::string::npos) {
|
|
175
|
+
add(SttModelKind::kNemoTransducer);
|
|
176
|
+
add(SttModelKind::kNemoCtc);
|
|
177
|
+
}
|
|
178
|
+
if (lower.find("tdt") != std::string::npos)
|
|
179
|
+
add(SttModelKind::kNemoTransducer);
|
|
180
|
+
if (lower.find("wenet") != std::string::npos)
|
|
181
|
+
add(SttModelKind::kWenetCtc);
|
|
182
|
+
if (lower.find("sense") != std::string::npos || lower.find("sensevoice") != std::string::npos)
|
|
183
|
+
add(SttModelKind::kSenseVoice);
|
|
184
|
+
if (lower.find("zipformer") != std::string::npos) {
|
|
185
|
+
add(SttModelKind::kTransducer);
|
|
186
|
+
add(SttModelKind::kZipformerCtc);
|
|
187
|
+
}
|
|
188
|
+
if (lower.find("funasr") != std::string::npos)
|
|
189
|
+
add(SttModelKind::kFunAsrNano);
|
|
190
|
+
if (lower.find("canary") != std::string::npos)
|
|
191
|
+
add(SttModelKind::kCanary);
|
|
192
|
+
if (lower.find("fire_red") != std::string::npos || lower.find("fire-red") != std::string::npos)
|
|
193
|
+
add(SttModelKind::kFireRedAsr);
|
|
194
|
+
if (lower.find("dolphin") != std::string::npos)
|
|
195
|
+
add(SttModelKind::kDolphin);
|
|
196
|
+
if (lower.find("omnilingual") != std::string::npos)
|
|
197
|
+
add(SttModelKind::kOmnilingual);
|
|
198
|
+
if (lower.find("medasr") != std::string::npos)
|
|
199
|
+
add(SttModelKind::kMedAsr);
|
|
200
|
+
if (lower.find("telespeech") != std::string::npos)
|
|
201
|
+
add(SttModelKind::kTeleSpeechCtc);
|
|
202
|
+
if (lower.find("t-one") != std::string::npos || lower.find("t_one") != std::string::npos ||
|
|
203
|
+
ContainsWord(lower, "tone"))
|
|
204
|
+
add(SttModelKind::kToneCtc);
|
|
205
|
+
if (lower.find("transducer") != std::string::npos) {
|
|
206
|
+
add(SttModelKind::kTransducer);
|
|
207
|
+
add(SttModelKind::kNemoTransducer);
|
|
59
208
|
}
|
|
60
209
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
std::string encoderPath = FindOnnxByAnyToken(files, {"encoder"}, preferInt8);
|
|
65
|
-
std::string decoderPath = FindOnnxByAnyToken(files, {"decoder"}, preferInt8);
|
|
66
|
-
std::string joinerPath = FindOnnxByAnyToken(files, {"joiner"}, preferInt8);
|
|
67
|
-
std::string tokensPath = FindFileEndingWith(files, "tokens.txt");
|
|
210
|
+
return out;
|
|
211
|
+
}
|
|
68
212
|
|
|
69
|
-
|
|
213
|
+
/**
 * Fills an SttCandidatePaths from the already-gathered file list of a model directory.
 *
 * Each field is assigned whatever the corresponding Find* helper returns for its name tokens
 * (transducer encoder/decoder/joiner, FunASR parts, Moonshine parts, a standalone model used
 * for both paraformer and CTC, tokens.txt, bpe.vocab). No model kind is chosen here.
 *
 * FunASR tokenizer directory: a vocab.json directly inside \p modelDir wins; otherwise the
 * parent directory of the first vocab.json found below \p modelDir is used.
 *
 * @param files      File entries for the model directory (entry.path and entry.nameLower are read).
 * @param modelDir   Model directory path, used only for the vocab.json lookup.
 * @param maxDepth   Not read by this function.
 * @param preferInt8 Forwarded unchanged to the FindOnnxBy* helpers.
 * @return The collected candidate paths.
 */
static SttCandidatePaths GatherSttCandidatePaths(
    const std::vector<FileEntry>& files,
    const std::string& modelDir,
    int maxDepth,
    const std::optional<bool>& preferInt8
) {
    SttCandidatePaths p;
    p.encoder = FindOnnxByAnyToken(files, {"encoder"}, preferInt8);
    p.decoder = FindOnnxByAnyToken(files, {"decoder"}, preferInt8);
    p.joiner = FindOnnxByAnyToken(files, {"joiner"}, preferInt8);
    p.funasrEncoderAdaptor = FindOnnxByAnyToken(files, {"encoder_adaptor", "encoder-adaptor"}, preferInt8);
    p.funasrLLM = FindOnnxByAnyToken(files, {"llm"}, preferInt8);
    p.funasrEmbedding = FindOnnxByAnyToken(files, {"embedding"}, preferInt8);
    {
        const std::string vocabName = "vocab.json";
        // Tail of a vocab.json that sits directly in modelDir. Its length comes from the string
        // itself so the size check and the comparison always agree (a hard-coded 12 for this
        // 11-character suffix made this branch unreachable before).
        const std::string rootVocabSuffix = "/" + vocabName;
        std::string vocabInSubdir;
        for (const auto& entry : files) {
            if (entry.nameLower != vocabName) continue;
            const std::string& path = entry.path;
            const bool underModelDir =
                path.size() >= modelDir.size() &&
                path.compare(0, modelDir.size(), modelDir) == 0 &&
                (modelDir.empty() || path[modelDir.size()] == '/');
            if (!underModelDir) continue;
            if (path.size() == modelDir.size() + rootVocabSuffix.size() &&
                path.compare(modelDir.size(), rootVocabSuffix.size(), rootVocabSuffix) == 0) {
                // vocab.json directly in modelDir: takes priority, stop searching.
                p.funasrTokenizerDir = modelDir;
                break;
            }
            // Remember only the first nested vocab.json as a fallback.
            if (vocabInSubdir.empty())
                vocabInSubdir = path;
        }
        if (p.funasrTokenizerDir.empty() && !vocabInSubdir.empty()) {
            size_t lastSlash = vocabInSubdir.find_last_of("/\\");
            if (lastSlash != std::string::npos)
                p.funasrTokenizerDir = vocabInSubdir.substr(0, lastSlash);
        }
    }
    p.moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
    p.moonshineEncoder = FindOnnxByAnyToken(files, {"encode", "encoder_model"}, preferInt8);
    p.moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
    p.moonshineCachedDecoder = FindOnnxByAnyTokenExcluding(
        files, std::vector<std::string>{"cached_decode", "cached"}, std::vector<std::string>{"uncached"}, preferInt8);
    p.moonshineMergedDecoder = FindOnnxByAnyToken(files, {"merged_decode", "merged_decoder", "decoder_model_merged", "merged"}, preferInt8);

    // Name tokens that mark a file as a component of a multi-file model, not a standalone one.
    static const std::vector<std::string> modelExcludes = {
        "encoder", "decoder", "joiner", "vocoder", "acoustic", "embedding", "llm",
        "encoder_adaptor", "encoder-adaptor", "encoder_model", "decoder_model",
        "merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached"
    };
    // Shared lookup for the standalone model file (used for both paraformer and CTC):
    // take the "model" token match unless it is really an encoder/decoder component,
    // otherwise fall back to the largest ONNX file not matching modelExcludes.
    const auto findStandaloneModel = [&files, &preferInt8]() {
        auto found = FindOnnxByAnyToken(files, {"model"}, preferInt8);
        if (!found.empty()) {
            std::string lower = ToLower(found);
            if (lower.find("encoder_model") != std::string::npos ||
                lower.find("decoder_model") != std::string::npos ||
                lower.find("merged_decoder") != std::string::npos)
                found.clear();
        }
        if (found.empty())
            found = FindLargestOnnxExcludingTokens(files, modelExcludes);
        return found;
    };
    p.paraformerModel = findStandaloneModel();
    p.ctcModel = findStandaloneModel();

    // A file already claimed as transducer encoder/decoder/joiner is not a standalone model.
    if (!p.paraformerModel.empty() &&
        (p.paraformerModel == p.encoder || p.paraformerModel == p.decoder || p.paraformerModel == p.joiner))
        p.paraformerModel.clear();
    if (!p.ctcModel.empty() &&
        (p.ctcModel == p.encoder || p.ctcModel == p.decoder || p.ctcModel == p.joiner))
        p.ctcModel.clear();

    p.tokens = FindFileEndingWith(files, "tokens.txt");
    p.bpeVocab = FindFileByName(files, "bpe.vocab");
    // Moonshine v2 encoder: reuse the generic encoder if found, else retry with "encoder_model" added.
    p.encoderForV2 = p.encoder.empty() ? FindOnnxByAnyToken(files, {"encoder", "encoder_model"}, preferInt8) : p.encoder;

    return p;
}
|
|
81
291
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
std::string
|
|
85
|
-
std::string
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
std::string
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
bool isLikelyTdt = modelDirLower.find("tdt") != std::string::npos;
|
|
108
|
-
bool isLikelyWenetCtc = modelDirLower.find("wenet") != std::string::npos;
|
|
109
|
-
bool isLikelySenseVoice = modelDirLower.find("sense") != std::string::npos ||
|
|
110
|
-
modelDirLower.find("sensevoice") != std::string::npos;
|
|
111
|
-
bool isLikelyFunAsrNano = modelDirLower.find("funasr") != std::string::npos ||
|
|
112
|
-
modelDirLower.find("funasr-nano") != std::string::npos;
|
|
113
|
-
bool isLikelyZipformer = modelDirLower.find("zipformer") != std::string::npos;
|
|
114
|
-
bool isLikelyMoonshine = modelDirLower.find("moonshine") != std::string::npos;
|
|
115
|
-
bool isLikelyDolphin = modelDirLower.find("dolphin") != std::string::npos;
|
|
116
|
-
bool isLikelyFireRedAsr = modelDirLower.find("fire_red") != std::string::npos ||
|
|
117
|
-
modelDirLower.find("fire-red") != std::string::npos;
|
|
118
|
-
bool isLikelyCanary = modelDirLower.find("canary") != std::string::npos;
|
|
119
|
-
bool isLikelyOmnilingual = modelDirLower.find("omnilingual") != std::string::npos;
|
|
120
|
-
bool isLikelyMedAsr = modelDirLower.find("medasr") != std::string::npos;
|
|
121
|
-
bool isLikelyTeleSpeech = modelDirLower.find("telespeech") != std::string::npos;
|
|
122
|
-
// Tone CTC: match "tone" only as standalone word (not e.g. "cantonese"); also accept "t-one" / "t_one"
|
|
123
|
-
bool isLikelyToneCtc = modelDirLower.find("t-one") != std::string::npos ||
|
|
124
|
-
modelDirLower.find("t_one") != std::string::npos ||
|
|
125
|
-
ContainsWord(modelDirLower, "tone");
|
|
126
|
-
|
|
127
|
-
bool hasMoonshine = !moonshinePreprocess.empty() && !moonshineUncachedDecode.empty() &&
|
|
128
|
-
!moonshineCachedDecode.empty() && !moonshineEncode.empty();
|
|
129
|
-
bool hasDolphin = isLikelyDolphin && !ctcModelPath.empty();
|
|
130
|
-
bool hasFireRedAsr = hasTransducer && isLikelyFireRedAsr;
|
|
131
|
-
bool hasCanary = hasWhisperEncoder && hasWhisperDecoder && joinerPath.empty() && isLikelyCanary;
|
|
132
|
-
bool hasOmnilingual = !ctcModelPath.empty() && isLikelyOmnilingual;
|
|
133
|
-
bool hasMedAsr = !ctcModelPath.empty() && isLikelyMedAsr;
|
|
134
|
-
bool hasTeleSpeechCtc = (!ctcModelPath.empty() || !paraformerModelPath.empty()) && isLikelyTeleSpeech;
|
|
135
|
-
bool hasToneCtc = !ctcModelPath.empty() && isLikelyToneCtc;
|
|
136
|
-
|
|
137
|
-
if (hasTransducer) {
|
|
138
|
-
if (isLikelyNemo || isLikelyTdt) {
|
|
139
|
-
result.detectedModels.push_back({"nemo_transducer", modelDir});
|
|
140
|
-
} else {
|
|
141
|
-
result.detectedModels.push_back({isLikelyZipformer ? "zipformer" : "transducer", modelDir});
|
|
142
|
-
}
|
|
143
|
-
}
|
|
292
|
+
/**
 * Derives name-based hints (isLikelyNemo, isLikelyMoonshine, ...) from \p modelDir.
 *
 * Every hint is a substring test on the lowercased \p modelDir string as passed in
 * ("tone" additionally goes through ContainsWord). No files are inspected.
 *
 * @param modelDir Model directory path.
 * @return Hints with one flag per recognised name token.
 */
static SttPathHints GetSttPathHints(const std::string& modelDir) {
    std::string lower = ToLower(modelDir);
    const auto has = [&lower](const char* token) {
        return lower.find(token) != std::string::npos;
    };

    SttPathHints h;
    h.isLikelyNemo = has("nemo") || has("parakeet");
    h.isLikelyTdt = has("tdt");
    h.isLikelyWenetCtc = has("wenet");
    h.isLikelySenseVoice = has("sense") || has("sensevoice");
    h.isLikelyFunAsrNano = has("funasr") || has("funasr-nano");
    h.isLikelyZipformer = has("zipformer");
    h.isLikelyMoonshine = has("moonshine");
    h.isLikelyDolphin = has("dolphin");
    h.isLikelyFireRedAsr = has("fire_red") || has("fire-red");
    h.isLikelyCanary = has("canary");
    h.isLikelyOmnilingual = has("omnilingual");
    h.isLikelyMedAsr = has("medasr");
    h.isLikelyTeleSpeech = has("telespeech");
    // tone_ctc is for T-One models only (e.g. streaming-t-one-russian). WeNetSpeech CTC
    // (yue, wu, etc.) uses wenet_ctc per sherpa-onnx docs.
    h.isLikelyToneCtc = has("t-one") || has("t_one") || ContainsWord(lower, "tone");
    h.isLikelyParaformer = has("paraformer");
    h.isLikelyVad = has("vad") || has("silero") || has("ten-vad");
    h.isLikelyTdnn = has("tdnn");
    return h;
}
|
|
144
317
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
} else if (isLikelyWenetCtc) {
|
|
149
|
-
result.detectedModels.push_back({"wenet_ctc", modelDir});
|
|
150
|
-
} else if (isLikelySenseVoice) {
|
|
151
|
-
result.detectedModels.push_back({"sense_voice", modelDir});
|
|
152
|
-
} else {
|
|
153
|
-
result.detectedModels.push_back({"ctc", modelDir});
|
|
154
|
-
}
|
|
155
|
-
} else if (!paraformerModelPath.empty()) {
|
|
156
|
-
result.detectedModels.push_back({"paraformer", modelDir});
|
|
157
|
-
}
|
|
318
|
+
/**
 * Error message when model is for unsupported hardware (RK35xx, Ascend, etc.).
 * The pointer itself is const so this file-level constant cannot be reseated.
 */
static const char* const kHardwareSpecificUnsupportedMessage =
    "This model is built for hardware-specific acceleration (e.g. RK35xx, Ascend, CANN) and is not supported by the React Native SDK. Use an ONNX model for CPU/GPU or a QNN-capable model on supported devices.";
|
|
158
321
|
|
|
159
|
-
|
|
160
|
-
|
|
322
|
+
/** True if model dir name indicates a hardware-specific build (e.g. RK3588, Ascend). Not runnable on generic host. QNN is supported by the SDK. */
|
|
323
|
+
static bool IsHardwareSpecificModelDir(const std::string& modelDir) {
|
|
324
|
+
std::string lower = ToLower(modelDir);
|
|
325
|
+
const char* tokens[] = {
|
|
326
|
+
"rk3588", "rk3576", "rk3568", "rk3566", "rk3562", "rknn",
|
|
327
|
+
"ascend", "cann", "910b", "910b2", "310p3"
|
|
328
|
+
};
|
|
329
|
+
for (const char* t : tokens) {
|
|
330
|
+
if (lower.find(t) != std::string::npos)
|
|
331
|
+
return true;
|
|
161
332
|
}
|
|
333
|
+
return false;
|
|
334
|
+
}
|
|
162
335
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
336
|
+
/**
 * Derives which STT model kinds are possible from the candidate paths and name hints.
 *
 * Several flags may be true at once: an encoder+decoder pair without a joiner sets
 * hasWhisper, and with the matching hint also hasCanary / hasFireRedAsr.
 *
 * @param paths Candidate file paths gathered for the model directory.
 * @param hints Name-based hints for the model directory.
 * @return Capability flags; choosing one kind is left to the caller.
 */
static SttCapabilities ComputeSttCapabilities(const SttCandidatePaths& paths, const SttPathHints& hints) {
    // Presence of the individual building blocks.
    const bool encoderFound = !paths.encoder.empty();
    const bool decoderFound = !paths.decoder.empty();
    const bool joinerFound = !paths.joiner.empty();
    const bool ctcFound = !paths.ctcModel.empty();
    const bool paraformerFound = !paths.paraformerModel.empty();
    const bool tokenizerDirFound = !paths.funasrTokenizerDir.empty();
    // Encoder + decoder without a joiner: the layout shared by Whisper, Canary and Fire Red ASR.
    const bool encDecOnly = encoderFound && decoderFound && !joinerFound;

    SttCapabilities c;
    c.hasTransducer = encoderFound && decoderFound && joinerFound;
    c.hasWhisper = encDecOnly;
    c.hasFunAsrNano = !paths.funasrEncoderAdaptor.empty() && !paths.funasrLLM.empty() &&
                      !paths.funasrEmbedding.empty() && tokenizerDirFound;
    c.hasMoonshine = !paths.moonshinePreprocessor.empty() && !paths.moonshineUncachedDecoder.empty() &&
                     !paths.moonshineCachedDecoder.empty() && !paths.moonshineEncoder.empty();
    c.hasMoonshineV2 = !paths.moonshineMergedDecoder.empty() && !paths.encoderForV2.empty() && !joinerFound;
    c.hasParaformer = paraformerFound;
    c.hasDolphin = hints.isLikelyDolphin && ctcFound;
    // Fire Red ASR: only encoder+decoder (two files). Single-file Fire Red
    // (e.g. fire-red-asr2-ctc) uses the CTC path to avoid a native crash.
    c.hasFireRedAsr = (c.hasTransducer || encDecOnly) && hints.isLikelyFireRedAsr;
    c.hasFireRedCtc = hints.isLikelyFireRedAsr && !encoderFound && !decoderFound &&
                      (ctcFound || paraformerFound);
    c.hasCanary = encDecOnly && hints.isLikelyCanary;
    c.hasOmnilingual = ctcFound && hints.isLikelyOmnilingual;
    c.hasMedAsr = ctcFound && hints.isLikelyMedAsr;
    c.hasTeleSpeechCtc = (ctcFound || paraformerFound) && hints.isLikelyTeleSpeech;
    c.hasToneCtc = ctcFound && hints.isLikelyToneCtc;
    return c;
}
|
|
361
|
+
|
|
362
|
+
static void CollectDetectedModels(
|
|
363
|
+
std::vector<DetectedModel>& out,
|
|
364
|
+
const SttCapabilities& cap,
|
|
365
|
+
const SttPathHints& hints,
|
|
366
|
+
const SttCandidatePaths& paths,
|
|
367
|
+
const std::string& modelDir
|
|
368
|
+
) {
|
|
369
|
+
if (cap.hasTransducer) {
|
|
370
|
+
out.push_back({(hints.isLikelyNemo || hints.isLikelyTdt) ? "nemo_transducer" : "transducer", modelDir});
|
|
180
371
|
}
|
|
181
|
-
if (
|
|
182
|
-
|
|
372
|
+
if (!paths.ctcModel.empty() && (hints.isLikelyNemo || hints.isLikelyWenetCtc || hints.isLikelySenseVoice || hints.isLikelyZipformer)) {
|
|
373
|
+
if (hints.isLikelyNemo) out.push_back({"nemo_ctc", modelDir});
|
|
374
|
+
else if (hints.isLikelyWenetCtc) out.push_back({"wenet_ctc", modelDir});
|
|
375
|
+
else if (hints.isLikelySenseVoice) out.push_back({"sense_voice", modelDir});
|
|
376
|
+
else out.push_back({"zipformer_ctc", modelDir});
|
|
377
|
+
} else if (!paths.paraformerModel.empty()) {
|
|
378
|
+
out.push_back({"paraformer", modelDir});
|
|
183
379
|
}
|
|
184
|
-
if (
|
|
185
|
-
|
|
380
|
+
if (cap.hasWhisper) out.push_back({"whisper", modelDir});
|
|
381
|
+
if (cap.hasFunAsrNano) out.push_back({"funasr_nano", modelDir});
|
|
382
|
+
if (cap.hasMoonshine) out.push_back({"moonshine", modelDir});
|
|
383
|
+
if (cap.hasMoonshineV2) out.push_back({"moonshine_v2", modelDir});
|
|
384
|
+
if (cap.hasDolphin) out.push_back({"dolphin", modelDir});
|
|
385
|
+
if (cap.hasFireRedAsr) out.push_back({"fire_red_asr", modelDir});
|
|
386
|
+
if (cap.hasCanary) out.push_back({"canary", modelDir});
|
|
387
|
+
if (cap.hasOmnilingual) out.push_back({"omnilingual", modelDir});
|
|
388
|
+
if (cap.hasMedAsr) out.push_back({"medasr", modelDir});
|
|
389
|
+
if (cap.hasTeleSpeechCtc) out.push_back({"telespeech_ctc", modelDir});
|
|
390
|
+
if (cap.hasToneCtc) out.push_back({"tone_ctc", modelDir});
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
/// Decides which STT model kind to use for `modelDir`.
///
/// - VAD / TDNN path hints are rejected up front.
/// - An explicit `modelType` (anything other than "auto") is parsed and then
///   checked against the files/capabilities that were found.
/// - Otherwise the kind is auto-detected: first from directory-name candidates
///   that the capabilities support, then from a fixed file-based priority order.
///
/// @param outError Cleared on entry; set to a message whenever a request is rejected.
/// @return The resolved kind, or SttModelKind::kUnknown. In the auto path kUnknown
///         is returned with `outError` left empty when nothing matches.
static SttModelKind ResolveSttKind(
    const std::optional<std::string>& modelType,
    const SttCapabilities& cap,
    const SttPathHints& hints,
    const SttCandidatePaths& paths,
    const std::string& modelDir,
    std::string& outError
) {
    outError.clear();

    // Records the reason and yields the "unknown" kind in one step.
    const auto reject = [&outError](const std::string& reason) {
        outError = reason;
        return SttModelKind::kUnknown;
    };

    if (hints.isLikelyVad) {
        return reject("VAD models are not yet supported by the React Native SDK.");
    }
    if (hints.isLikelyTdnn) {
        return reject("TDNN (keyword/yesno) models are not yet supported by the React Native SDK.");
    }

    const bool explicitRequest = modelType.has_value() && modelType.value() != "auto";
    if (explicitRequest) {
        const SttModelKind selected = ParseSttModelType(modelType.value());
        switch (selected) {
            case SttModelKind::kUnknown:
                return reject("Unknown model type: " + modelType.value());

            case SttModelKind::kTransducer:
                if (!cap.hasTransducer)
                    return reject("Transducer model requested but files not found in " + modelDir);
                break;
            case SttModelKind::kNemoTransducer:
                if (!cap.hasTransducer)
                    return reject("NeMo Transducer model requested but encoder/decoder/joiner not found in " + modelDir);
                break;
            case SttModelKind::kParaformer:
                if (paths.paraformerModel.empty())
                    return reject("Paraformer model requested but model file not found in " + modelDir);
                break;

            case SttModelKind::kNemoCtc:
            case SttModelKind::kWenetCtc:
            case SttModelKind::kSenseVoice:
            case SttModelKind::kZipformerCtc:
                if (paths.ctcModel.empty())
                    return reject("CTC model requested but model file not found in " + modelDir);
                break;
            case SttModelKind::kToneCtc:
                // Tone CTC needs the generic CTC model file first, then the tone capability.
                if (paths.ctcModel.empty())
                    return reject("CTC model requested but model file not found in " + modelDir);
                if (!cap.hasToneCtc)
                    return reject("Tone CTC model requested but path does not contain 'tone' (as a word), 't-one', or 't_one' (e.g. sherpa-onnx-streaming-t-one-*) in " + modelDir);
                break;

            case SttModelKind::kWhisper:
                if (!cap.hasWhisper)
                    return reject("Whisper model requested but encoder/decoder not found in " + modelDir);
                break;
            case SttModelKind::kFunAsrNano:
                if (!cap.hasFunAsrNano)
                    return reject("FunASR Nano model requested but required files not found in " + modelDir);
                break;
            case SttModelKind::kMoonshine:
                if (!cap.hasMoonshine)
                    return reject("Moonshine v1 model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir);
                break;
            case SttModelKind::kMoonshineV2:
                if (!cap.hasMoonshineV2)
                    return reject("Moonshine v2 model requested but encoder/merged_decode not found in " + modelDir);
                break;
            case SttModelKind::kDolphin:
                if (!cap.hasDolphin)
                    return reject("Dolphin model requested but model not found in " + modelDir);
                break;
            case SttModelKind::kFireRedAsr:
                if (!cap.hasFireRedAsr)
                    return reject("FireRed ASR model requested but encoder/decoder not found in " + modelDir);
                break;
            case SttModelKind::kCanary:
                if (!cap.hasCanary)
                    return reject("Canary model requested but encoder/decoder not found in " + modelDir);
                break;
            case SttModelKind::kOmnilingual:
                if (!cap.hasOmnilingual)
                    return reject("Omnilingual model requested but model not found in " + modelDir);
                break;
            case SttModelKind::kMedAsr:
                if (!cap.hasMedAsr)
                    return reject("MedASR model requested but model not found in " + modelDir);
                break;
            case SttModelKind::kTeleSpeechCtc:
                if (!cap.hasTeleSpeechCtc)
                    return reject("TeleSpeech CTC model requested but model not found in " + modelDir);
                break;

            default:
                // Any other parsed kind is accepted without a file check.
                break;
        }
        return selected;
    }

    // Auto, priority 1: kinds suggested by the folder name, in the order given,
    // as long as the files found actually support them.
    const std::vector<SttModelKind> kindsFromName = GetKindsFromDirName(modelDir);
    for (const SttModelKind suggested : kindsFromName) {
        if (CapabilitySupportsKind(suggested, cap, hints, paths)) {
            return suggested;
        }
    }

    // Auto, priority 2: file-only detection. The order below is the tie-breaker
    // when several capabilities are present, so it must not be rearranged.
    if (cap.hasTransducer) {
        const bool nemoStyle = hints.isLikelyNemo || hints.isLikelyTdt;
        return nemoStyle ? SttModelKind::kNemoTransducer : SttModelKind::kTransducer;
    }
    if (hints.isLikelyMoonshine) {
        if (cap.hasMoonshineV2) return SttModelKind::kMoonshineV2;
        if (cap.hasMoonshine) return SttModelKind::kMoonshine;
    }
    if (!paths.ctcModel.empty()) {
        // A CTC file plus a flavour hint settles the kind; without a hint keep looking.
        if (hints.isLikelyToneCtc) return SttModelKind::kToneCtc;
        if (hints.isLikelyNemo) return SttModelKind::kNemoCtc;
        if (hints.isLikelyWenetCtc) return SttModelKind::kWenetCtc;
        if (hints.isLikelySenseVoice) return SttModelKind::kSenseVoice;
    }
    if (cap.hasFunAsrNano && hints.isLikelyFunAsrNano) return SttModelKind::kFunAsrNano;
    if (cap.hasFireRedCtc) return SttModelKind::kZipformerCtc;
    if (!paths.paraformerModel.empty()) return SttModelKind::kParaformer;
    if (cap.hasCanary) return SttModelKind::kCanary;
    if (cap.hasFireRedAsr) return SttModelKind::kFireRedAsr;
    if (cap.hasWhisper) return SttModelKind::kWhisper;
    if (cap.hasFunAsrNano) return SttModelKind::kFunAsrNano;
    if (cap.hasMoonshineV2) return SttModelKind::kMoonshineV2;
    if (cap.hasDolphin) return SttModelKind::kDolphin;
    if (cap.hasOmnilingual) return SttModelKind::kOmnilingual;
    if (cap.hasMedAsr) return SttModelKind::kMedAsr;
    if (cap.hasTeleSpeechCtc) return SttModelKind::kTeleSpeechCtc;
    if (cap.hasToneCtc) return SttModelKind::kToneCtc;
    if (!paths.ctcModel.empty()) return SttModelKind::kZipformerCtc;

    return SttModelKind::kUnknown;
}
|
|
519
|
+
|
|
520
|
+
/// Copies from `candidate` into `resultPaths` only the path fields that the
/// given model `kind` uses. Kinds without a mapping leave `resultPaths` untouched.
static void ApplyPathsForSttKind(SttModelKind kind, const SttCandidatePaths& candidate, SttModelPaths& resultPaths) {
    using Kind = SttModelKind;

    // Returns `preferred` unless it is empty, in which case `fallback` is used.
    const auto firstNonEmpty = [](const std::string& preferred, const std::string& fallback) {
        return preferred.empty() ? fallback : preferred;
    };

    if (kind == Kind::kTransducer || kind == Kind::kNemoTransducer) {
        resultPaths.encoder = candidate.encoder;
        resultPaths.decoder = candidate.decoder;
        resultPaths.joiner = candidate.joiner;
    } else if (kind == Kind::kParaformer) {
        resultPaths.paraformerModel = candidate.paraformerModel;
    } else if (kind == Kind::kNemoCtc || kind == Kind::kWenetCtc || kind == Kind::kSenseVoice ||
               kind == Kind::kZipformerCtc || kind == Kind::kToneCtc) {
        resultPaths.ctcModel = candidate.ctcModel;
    } else if (kind == Kind::kWhisper) {
        resultPaths.whisperEncoder = candidate.encoder;
        resultPaths.whisperDecoder = candidate.decoder;
    } else if (kind == Kind::kFireRedAsr) {
        // When a dedicated encoder/decoder is missing, fall back to the single
        // model file (paraformer candidate first, then the CTC candidate).
        const std::string singleModel = firstNonEmpty(candidate.paraformerModel, candidate.ctcModel);
        resultPaths.fireRedEncoder = firstNonEmpty(candidate.encoder, singleModel);
        resultPaths.fireRedDecoder = firstNonEmpty(candidate.decoder, singleModel);
    } else if (kind == Kind::kFunAsrNano) {
        resultPaths.funasrEncoderAdaptor = candidate.funasrEncoderAdaptor;
        resultPaths.funasrLLM = candidate.funasrLLM;
        resultPaths.funasrEmbedding = candidate.funasrEmbedding;
        resultPaths.funasrTokenizer = candidate.funasrTokenizerDir;
    } else if (kind == Kind::kMoonshine) {
        resultPaths.moonshinePreprocessor = candidate.moonshinePreprocessor;
        resultPaths.moonshineEncoder = candidate.moonshineEncoder;
        resultPaths.moonshineUncachedDecoder = candidate.moonshineUncachedDecoder;
        resultPaths.moonshineCachedDecoder = candidate.moonshineCachedDecoder;
    } else if (kind == Kind::kMoonshineV2) {
        // v2 takes its encoder from the dedicated v2 candidate field.
        resultPaths.moonshineEncoder = candidate.encoderForV2;
        resultPaths.moonshineMergedDecoder = candidate.moonshineMergedDecoder;
    } else if (kind == Kind::kDolphin) {
        resultPaths.dolphinModel = firstNonEmpty(candidate.ctcModel, candidate.paraformerModel);
    } else if (kind == Kind::kCanary) {
        resultPaths.canaryEncoder = candidate.encoder;
        resultPaths.canaryDecoder = candidate.decoder;
    } else if (kind == Kind::kOmnilingual) {
        resultPaths.omnilingualModel = candidate.ctcModel;
    } else if (kind == Kind::kMedAsr) {
        resultPaths.medasrModel = candidate.ctcModel;
    } else if (kind == Kind::kTeleSpeechCtc) {
        resultPaths.telespeechCtcModel = firstNonEmpty(candidate.ctcModel, candidate.paraformerModel);
    }
    // Any other kind: nothing to copy.
}
|
|
584
|
+
|
|
585
|
+
} // namespace
|
|
586
|
+
|
|
587
|
+
/// Scans `modelDir` for STT model files, resolves which model kind to use
/// (honouring `modelType` unless it is absent or "auto"), fills in the paths for
/// that kind, and validates them.
///
/// @param modelDir   Directory to scan (searched recursively up to a fixed depth).
/// @param preferInt8 Forwarded to candidate gathering.
/// @param modelType  Optional explicit model type name; "auto" or empty means auto-detect.
/// @param debug      When true, logs the gathered candidates, capabilities and hints.
/// @return A result whose `ok` is set to true only after validation succeeds;
///         on failure `error` carries the reason.
SttDetectResult DetectSttModel(
    const std::string& modelDir,
    const std::optional<bool>& preferInt8,
    const std::optional<std::string>& modelType,
    bool debug /* = false */
) {
    using namespace model_detect;

    SttDetectResult result;

    if (modelDir.empty()) {
        result.error = "Model directory is empty";
        return result;
    }
    if (!(FileExists(modelDir) && IsDirectory(modelDir))) {
        result.error = "Model directory does not exist or is not a directory: " + modelDir;
        return result;
    }

    // Gather everything detection needs: the file listing, per-role candidate
    // paths, name-based hints and the capabilities derived from both.
    const int kMaxSearchDepth = 4;
    const std::vector<FileEntry> files = ListFilesRecursive(modelDir, kMaxSearchDepth);

    SttCandidatePaths found = GatherSttCandidatePaths(files, modelDir, kMaxSearchDepth, preferInt8);
    SttPathHints pathHints = GetSttPathHints(modelDir);
    SttCapabilities caps = ComputeSttCapabilities(found, pathHints);

    if (debug) {
        LOGI("DetectSttModel: tokens=%s", EmptyOrPath(found.tokens));
        LOGI("DetectSttModel: transducer encoder=%s decoder=%s joiner=%s",
             EmptyOrPath(found.encoder), EmptyOrPath(found.decoder), EmptyOrPath(found.joiner));
        LOGI("DetectSttModel: paraformerModel=%s ctcModel=%s tokens=%s bpeVocab=%s",
             EmptyOrPath(found.paraformerModel), EmptyOrPath(found.ctcModel),
             EmptyOrPath(found.tokens), EmptyOrPath(found.bpeVocab));
        LOGI("DetectSttModel: moonshine preprocessor=%s encoder=%s uncachedDecoder=%s cachedDecoder=%s mergedDecoder=%s",
             EmptyOrPath(found.moonshinePreprocessor), EmptyOrPath(found.moonshineEncoder),
             EmptyOrPath(found.moonshineUncachedDecoder), EmptyOrPath(found.moonshineCachedDecoder),
             EmptyOrPath(found.moonshineMergedDecoder));
        LOGI("DetectSttModel: whisper encoder=%s decoder=%s (same as transducer; joiner empty => whisper)",
             EmptyOrPath(found.encoder), EmptyOrPath(found.decoder));
        LOGI("DetectSttModel: funasr encoderAdaptor=%s llm=%s embedding=%s tokenizerDir=%s",
             EmptyOrPath(found.funasrEncoderAdaptor), EmptyOrPath(found.funasrLLM),
             EmptyOrPath(found.funasrEmbedding), EmptyOrPath(found.funasrTokenizerDir));
        LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
             static_cast<int>(caps.hasTransducer), static_cast<int>(caps.hasWhisper),
             static_cast<int>(caps.hasMoonshine), static_cast<int>(caps.hasMoonshineV2),
             static_cast<int>(caps.hasParaformer), static_cast<int>(caps.hasFunAsrNano),
             static_cast<int>(caps.hasDolphin), static_cast<int>(caps.hasFireRedAsr),
             static_cast<int>(caps.hasFireRedCtc), static_cast<int>(caps.hasCanary),
             static_cast<int>(caps.hasOmnilingual), static_cast<int>(caps.hasMedAsr),
             static_cast<int>(caps.hasTeleSpeechCtc), static_cast<int>(caps.hasToneCtc));
        LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
             static_cast<int>(pathHints.isLikelyNemo), static_cast<int>(pathHints.isLikelyTdt),
             static_cast<int>(pathHints.isLikelyWenetCtc), static_cast<int>(pathHints.isLikelySenseVoice),
             static_cast<int>(pathHints.isLikelyFunAsrNano), static_cast<int>(pathHints.isLikelyZipformer),
             static_cast<int>(pathHints.isLikelyMoonshine), static_cast<int>(pathHints.isLikelyDolphin),
             static_cast<int>(pathHints.isLikelyFireRedAsr), static_cast<int>(pathHints.isLikelyCanary),
             static_cast<int>(pathHints.isLikelyOmnilingual), static_cast<int>(pathHints.isLikelyMedAsr),
             static_cast<int>(pathHints.isLikelyTeleSpeech), static_cast<int>(pathHints.isLikelyToneCtc),
             static_cast<int>(pathHints.isLikelyParaformer), static_cast<int>(pathHints.isLikelyVad),
             static_cast<int>(pathHints.isLikelyTdnn));
    }

    CollectDetectedModels(result.detectedModels, caps, pathHints, found, modelDir);

    result.selectedKind = ResolveSttKind(modelType, caps, pathHints, found, modelDir, result.error);
    if (result.selectedKind == SttModelKind::kUnknown) {
        if (IsHardwareSpecificModelDir(modelDir)) {
            // A hardware-specific directory gets its own message, replacing any resolver error.
            result.ok = false;
            result.isHardwareSpecificUnsupported = true;
            result.error = kHardwareSpecificUnsupportedMessage;
        } else if (result.error.empty()) {
            // The resolver gave no reason, so report a generic one.
            result.error = "No compatible model type detected in " + modelDir;
        }
        return result;
    }

    LOGI("DetectSttModel: selected kind=%d (%s)", static_cast<int>(result.selectedKind), KindToName(result.selectedKind));
    // FunASR Nano is the only kind for which a missing tokens file is not an error.
    result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano);
    ApplyPathsForSttKind(result.selectedKind, found, result.paths);

    const bool tokensPresent = !found.tokens.empty() && FileExists(found.tokens);
    if (tokensPresent) {
        result.paths.tokens = found.tokens;
    } else if (result.tokensRequired) {
        result.error = "Tokens file not found in " + modelDir;
        return result;
    }

    const bool bpeVocabPresent = !found.bpeVocab.empty() && FileExists(found.bpeVocab);
    if (bpeVocabPresent) {
        result.paths.bpeVocab = found.bpeVocab;
    }

    auto validation = ValidateSttPaths(result.selectedKind, result.paths, modelDir);
    if (!validation.ok) {
        result.ok = false;
        result.error = validation.error;
        return result;
    }

    // Log the paths that were actually applied for the selected kind.
    const SttModelPaths& applied = result.paths;
    switch (result.selectedKind) {
        case SttModelKind::kTransducer:
        case SttModelKind::kNemoTransducer:
            LOGI("DetectSttModel: paths set encoder=%s decoder=%s joiner=%s",
                 EmptyOrPath(applied.encoder), EmptyOrPath(applied.decoder), EmptyOrPath(applied.joiner));
            break;
        case SttModelKind::kParaformer:
            LOGI("DetectSttModel: paths set paraformerModel=%s", EmptyOrPath(applied.paraformerModel));
            break;
        case SttModelKind::kWhisper:
            LOGI("DetectSttModel: paths set whisperEncoder=%s whisperDecoder=%s",
                 EmptyOrPath(applied.whisperEncoder), EmptyOrPath(applied.whisperDecoder));
            break;
        case SttModelKind::kMoonshine:
            LOGI("DetectSttModel: paths set moonshine preprocessor=%s encoder=%s uncachedDecoder=%s cachedDecoder=%s",
                 EmptyOrPath(applied.moonshinePreprocessor), EmptyOrPath(applied.moonshineEncoder),
                 EmptyOrPath(applied.moonshineUncachedDecoder), EmptyOrPath(applied.moonshineCachedDecoder));
            break;
        case SttModelKind::kMoonshineV2:
            LOGI("DetectSttModel: paths set moonshine_v2 encoder=%s mergedDecoder=%s",
                 EmptyOrPath(applied.moonshineEncoder), EmptyOrPath(applied.moonshineMergedDecoder));
            break;
        case SttModelKind::kNemoCtc:
        case SttModelKind::kWenetCtc:
        case SttModelKind::kSenseVoice:
        case SttModelKind::kZipformerCtc:
        case SttModelKind::kToneCtc:
            LOGI("DetectSttModel: paths set ctcModel=%s", EmptyOrPath(applied.ctcModel));
            break;
        case SttModelKind::kFireRedAsr:
            LOGI("DetectSttModel: paths set fireRedEncoder=%s fireRedDecoder=%s",
                 EmptyOrPath(applied.fireRedEncoder), EmptyOrPath(applied.fireRedDecoder));
            break;
        case SttModelKind::kFunAsrNano:
            LOGI("DetectSttModel: paths set funasr adaptor=%s llm=%s embedding=%s tokenizer=%s",
                 EmptyOrPath(applied.funasrEncoderAdaptor), EmptyOrPath(applied.funasrLLM),
                 EmptyOrPath(applied.funasrEmbedding), EmptyOrPath(applied.funasrTokenizer));
            break;
        default:
            break;
    }
    LOGI("DetectSttModel: tokens=%s (required=%d)", EmptyOrPath(applied.tokens), static_cast<int>(result.tokensRequired));
    LOGI("DetectSttModel: detection OK for %s", modelDir.c_str());

    result.ok = true;
    return result;
}
|