react-native-sherpa-onnx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/LICENSE +20 -0
  2. package/README.md +402 -0
  3. package/SherpaOnnx.podspec +84 -0
  4. package/android/build.gradle +193 -0
  5. package/android/src/main/AndroidManifest.xml +2 -0
  6. package/android/src/main/cpp/CMakeLists.txt +121 -0
  7. package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +1918 -0
  8. package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +841 -0
  9. package/android/src/main/cpp/jni/sherpa-onnx-jni.cpp +129 -0
  10. package/android/src/main/cpp/jni/sherpa-onnx-wrapper.cpp +649 -0
  11. package/android/src/main/cpp/jni/sherpa-onnx-wrapper.h +56 -0
  12. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +316 -0
  13. package/android/src/main/java/com/sherpaonnx/SherpaOnnxPackage.kt +33 -0
  14. package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
  15. package/ios/SherpaOnnx.h +5 -0
  16. package/ios/SherpaOnnx.mm +293 -0
  17. package/ios/SherpaOnnx.xcconfig +19 -0
  18. package/ios/include/sherpa-onnx/c-api/c-api.h +1918 -0
  19. package/ios/include/sherpa-onnx/c-api/cxx-api.h +841 -0
  20. package/ios/sherpa-onnx-wrapper.h +57 -0
  21. package/ios/sherpa-onnx-wrapper.mm +432 -0
  22. package/lib/module/NativeSherpaOnnx.js +5 -0
  23. package/lib/module/NativeSherpaOnnx.js.map +1 -0
  24. package/lib/module/diarization/index.js +54 -0
  25. package/lib/module/diarization/index.js.map +1 -0
  26. package/lib/module/enhancement/index.js +54 -0
  27. package/lib/module/enhancement/index.js.map +1 -0
  28. package/lib/module/index.js +25 -0
  29. package/lib/module/index.js.map +1 -0
  30. package/lib/module/package.json +1 -0
  31. package/lib/module/separation/index.js +54 -0
  32. package/lib/module/separation/index.js.map +1 -0
  33. package/lib/module/stt/index.js +79 -0
  34. package/lib/module/stt/index.js.map +1 -0
  35. package/lib/module/stt/types.js +4 -0
  36. package/lib/module/stt/types.js.map +1 -0
  37. package/lib/module/tts/index.js +54 -0
  38. package/lib/module/tts/index.js.map +1 -0
  39. package/lib/module/types.js +2 -0
  40. package/lib/module/types.js.map +1 -0
  41. package/lib/module/utils.js +93 -0
  42. package/lib/module/utils.js.map +1 -0
  43. package/lib/module/vad/index.js +54 -0
  44. package/lib/module/vad/index.js.map +1 -0
  45. package/lib/typescript/package.json +1 -0
  46. package/lib/typescript/src/NativeSherpaOnnx.d.ts +39 -0
  47. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -0
  48. package/lib/typescript/src/diarization/index.d.ts +49 -0
  49. package/lib/typescript/src/diarization/index.d.ts.map +1 -0
  50. package/lib/typescript/src/enhancement/index.d.ts +47 -0
  51. package/lib/typescript/src/enhancement/index.d.ts.map +1 -0
  52. package/lib/typescript/src/index.d.ts +9 -0
  53. package/lib/typescript/src/index.d.ts.map +1 -0
  54. package/lib/typescript/src/separation/index.d.ts +48 -0
  55. package/lib/typescript/src/separation/index.d.ts.map +1 -0
  56. package/lib/typescript/src/stt/index.d.ts +53 -0
  57. package/lib/typescript/src/stt/index.d.ts.map +1 -0
  58. package/lib/typescript/src/stt/types.d.ts +39 -0
  59. package/lib/typescript/src/stt/types.d.ts.map +1 -0
  60. package/lib/typescript/src/tts/index.d.ts +47 -0
  61. package/lib/typescript/src/tts/index.d.ts.map +1 -0
  62. package/lib/typescript/src/types.d.ts +59 -0
  63. package/lib/typescript/src/types.d.ts.map +1 -0
  64. package/lib/typescript/src/utils.d.ts +53 -0
  65. package/lib/typescript/src/utils.d.ts.map +1 -0
  66. package/lib/typescript/src/vad/index.d.ts +48 -0
  67. package/lib/typescript/src/vad/index.d.ts.map +1 -0
  68. package/package.json +221 -0
  69. package/scripts/copy-headers.js +184 -0
  70. package/scripts/setup-assets.js +323 -0
  71. package/scripts/setup-ios-framework.sh +282 -0
  72. package/scripts/switch-registry.js +75 -0
  73. package/src/NativeSherpaOnnx.ts +44 -0
  74. package/src/diarization/index.ts +69 -0
  75. package/src/enhancement/index.ts +67 -0
  76. package/src/index.tsx +30 -0
  77. package/src/separation/index.ts +68 -0
  78. package/src/stt/index.ts +83 -0
  79. package/src/stt/types.ts +42 -0
  80. package/src/tts/index.ts +67 -0
  81. package/src/types.ts +73 -0
  82. package/src/utils.ts +97 -0
  83. package/src/vad/index.ts +70 -0
@@ -0,0 +1,649 @@
1
+ #include "sherpa-onnx-wrapper.h"
2
+ #include <android/log.h>
3
+ #include <fstream>
4
+ #include <sstream>
5
+ #include <optional>
6
+ #include <sys/stat.h>
7
+ #include <algorithm>
8
+ #include <cctype>
9
+
10
+ // Use filesystem if available (C++17), otherwise fallback
11
+ #if __cplusplus >= 201703L && __has_include(<filesystem>)
12
+ #include <filesystem>
13
+ namespace fs = std::filesystem;
14
+ #elif __has_include(<experimental/filesystem>)
15
+ #include <experimental/filesystem>
16
+ namespace fs = std::experimental::filesystem;
17
+ #else
18
+ // Fallback: use stat/opendir for older compilers
19
+ #include <sys/stat.h>
20
+ #include <dirent.h>
21
+ #endif
22
+
23
+ // sherpa-onnx headers - use cxx-api which is compatible with libsherpa-onnx-cxx-api.so
24
+ #include "sherpa-onnx/c-api/cxx-api.h"
25
+
26
+ #define LOG_TAG "SherpaOnnxWrapper"
27
+ #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
28
+ #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
29
+
30
+ namespace sherpaonnx {
31
+
32
+ // PIMPL pattern implementation
33
+ class SherpaOnnxWrapper::Impl {
34
+ public:
35
+ bool initialized = false;
36
+ std::string modelDir;
37
+ std::optional<sherpa_onnx::cxx::OfflineRecognizer> recognizer;
38
+ };
39
+
40
+ SherpaOnnxWrapper::SherpaOnnxWrapper() : pImpl(std::make_unique<Impl>()) {
41
+ LOGI("SherpaOnnxWrapper created");
42
+ }
43
+
44
+ SherpaOnnxWrapper::~SherpaOnnxWrapper() {
45
+ release();
46
+ LOGI("SherpaOnnxWrapper destroyed");
47
+ }
48
+
49
+ bool SherpaOnnxWrapper::initialize(
50
+ const std::string& modelDir,
51
+ const std::optional<bool>& preferInt8,
52
+ const std::optional<std::string>& modelType
53
+ ) {
54
+ if (pImpl->initialized) {
55
+ release();
56
+ }
57
+
58
+ if (modelDir.empty()) {
59
+ LOGE("Model directory is empty");
60
+ return false;
61
+ }
62
+
63
+ try {
64
+ // Helper function to check if file exists
65
+ auto fileExists = [](const std::string& path) -> bool {
66
+ #if __cplusplus >= 201703L && __has_include(<filesystem>)
67
+ return std::filesystem::exists(path);
68
+ #elif __has_include(<experimental/filesystem>)
69
+ return std::experimental::filesystem::exists(path);
70
+ #else
71
+ struct stat buffer;
72
+ return (stat(path.c_str(), &buffer) == 0);
73
+ #endif
74
+ };
75
+
76
+ auto isDirectory = [](const std::string& path) -> bool {
77
+ #if __cplusplus >= 201703L && __has_include(<filesystem>)
78
+ return std::filesystem::is_directory(path);
79
+ #elif __has_include(<experimental/filesystem>)
80
+ return std::experimental::filesystem::is_directory(path);
81
+ #else
82
+ struct stat buffer;
83
+ if (stat(path.c_str(), &buffer) != 0) return false;
84
+ return S_ISDIR(buffer.st_mode);
85
+ #endif
86
+ };
87
+
88
+ // Check if model directory exists
89
+ if (!fileExists(modelDir) || !isDirectory(modelDir)) {
90
+ LOGE("Model directory does not exist or is not a directory: %s", modelDir.c_str());
91
+ return false;
92
+ }
93
+
94
+ // Setup configuration
95
+ sherpa_onnx::cxx::OfflineRecognizerConfig config;
96
+
97
+ // Set default feature config (16kHz, 80-dim for most models)
98
+ config.feat_config.sample_rate = 16000;
99
+ config.feat_config.feature_dim = 80;
100
+
101
+ // Build paths for model files
102
+ std::string encoderPath = modelDir + "/encoder.onnx";
103
+ std::string decoderPath = modelDir + "/decoder.onnx";
104
+ std::string joinerPath = modelDir + "/joiner.onnx";
105
+ std::string encoderPathInt8 = modelDir + "/encoder.int8.onnx";
106
+ std::string decoderPathInt8 = modelDir + "/decoder.int8.onnx";
107
+ std::string paraformerPathInt8 = modelDir + "/model.int8.onnx";
108
+ std::string paraformerPath = modelDir + "/model.onnx";
109
+ std::string ctcPathInt8 = modelDir + "/model.int8.onnx";
110
+ std::string ctcPath = modelDir + "/model.onnx";
111
+ std::string tokensPath = modelDir + "/tokens.txt";
112
+
113
+ // FunASR Nano paths
114
+ std::string funasrEncoderAdaptor = modelDir + "/encoder_adaptor.onnx";
115
+ std::string funasrEncoderAdaptorInt8 = modelDir + "/encoder_adaptor.int8.onnx";
116
+ std::string funasrLLM = modelDir + "/llm.onnx";
117
+ std::string funasrLLMInt8 = modelDir + "/llm.int8.onnx";
118
+ std::string funasrEmbedding = modelDir + "/embedding.onnx";
119
+ std::string funasrEmbeddingInt8 = modelDir + "/embedding.int8.onnx";
120
+
121
+ // Helper function to find FunASR Nano tokenizer directory
122
+ // Looks in main directory and subdirectories with "Qwen3" in name
123
+ auto findFunAsrTokenizer = [&fileExists, &modelDir]() -> std::string {
124
+ // First check if vocab.json exists directly in model directory
125
+ std::string vocabInMain = modelDir + "/vocab.json";
126
+ if (fileExists(vocabInMain)) {
127
+ return modelDir; // Tokenizer files are in main directory
128
+ }
129
+
130
+ // Search for subdirectories with "Qwen3" in name
131
+ try {
132
+ for (const auto& entry : fs::directory_iterator(modelDir)) {
133
+ if (entry.is_directory()) {
134
+ std::string dirName = entry.path().filename().string();
135
+ // Check if directory name contains "Qwen3" (case-insensitive check)
136
+ std::string dirNameLower = dirName;
137
+ std::transform(dirNameLower.begin(), dirNameLower.end(), dirNameLower.begin(), ::tolower);
138
+ if (dirNameLower.find("qwen3") != std::string::npos) {
139
+ std::string vocabPath = entry.path().string() + "/vocab.json";
140
+ if (fileExists(vocabPath)) {
141
+ return entry.path().string();
142
+ }
143
+ }
144
+ }
145
+ }
146
+ } catch (const std::exception& e) {
147
+ // Error accessing directory - will return empty string
148
+ }
149
+
150
+ // Fallback: try common name
151
+ std::string commonPath = modelDir + "/Qwen3-0.6B";
152
+ if (fileExists(commonPath + "/vocab.json")) {
153
+ return commonPath;
154
+ }
155
+
156
+ return ""; // Not found
157
+ };
158
+
159
+ std::string funasrTokenizer = findFunAsrTokenizer();
160
+
161
+ // Tokens file is required for most models, but Whisper doesn't use it
162
+ // We'll check for it conditionally based on model type
163
+ bool tokensRequired = true;
164
+
165
+ // Configure based on model type
166
+ // Check for Paraformer model based on preferInt8 preference
167
+ std::string paraformerModelPath;
168
+ if (preferInt8.has_value()) {
169
+ if (preferInt8.value()) {
170
+ // Prefer int8 models
171
+ if (fileExists(paraformerPathInt8)) {
172
+ paraformerModelPath = paraformerPathInt8;
173
+ } else if (fileExists(paraformerPath)) {
174
+ paraformerModelPath = paraformerPath;
175
+ }
176
+ } else {
177
+ // Prefer regular models
178
+ if (fileExists(paraformerPath)) {
179
+ paraformerModelPath = paraformerPath;
180
+ } else if (fileExists(paraformerPathInt8)) {
181
+ paraformerModelPath = paraformerPathInt8;
182
+ }
183
+ }
184
+ } else {
185
+ // Default: try int8 first, then regular
186
+ if (fileExists(paraformerPathInt8)) {
187
+ paraformerModelPath = paraformerPathInt8;
188
+ } else if (fileExists(paraformerPath)) {
189
+ paraformerModelPath = paraformerPath;
190
+ }
191
+ }
192
+
193
+ // Check for CTC model (NeMo CTC) - similar structure to Paraformer
194
+ std::string ctcModelPath;
195
+ if (preferInt8.has_value()) {
196
+ if (preferInt8.value()) {
197
+ // Prefer int8 models
198
+ if (fileExists(ctcPathInt8)) {
199
+ ctcModelPath = ctcPathInt8;
200
+ } else if (fileExists(ctcPath)) {
201
+ ctcModelPath = ctcPath;
202
+ }
203
+ } else {
204
+ // Prefer regular models
205
+ if (fileExists(ctcPath)) {
206
+ ctcModelPath = ctcPath;
207
+ } else if (fileExists(ctcPathInt8)) {
208
+ ctcModelPath = ctcPathInt8;
209
+ }
210
+ }
211
+ } else {
212
+ // Default: try int8 first, then regular
213
+ if (fileExists(ctcPathInt8)) {
214
+ ctcModelPath = ctcPathInt8;
215
+ } else if (fileExists(ctcPath)) {
216
+ ctcModelPath = ctcPath;
217
+ }
218
+ }
219
+
220
+ // Determine model type: use explicit type if provided, otherwise auto-detect
221
+ bool hasTransducer = fileExists(encoderPath) &&
222
+ fileExists(decoderPath) &&
223
+ fileExists(joinerPath);
224
+
225
+ // Check for Whisper model (encoder + decoder, but no joiner)
226
+ // Whisper can have tokens.txt but it's optional
227
+ bool hasWhisperEncoder = fileExists(encoderPath) || fileExists(encoderPathInt8);
228
+ bool hasWhisperDecoder = fileExists(decoderPath) || fileExists(decoderPathInt8);
229
+ bool hasWhisper = hasWhisperEncoder && hasWhisperDecoder && !fileExists(joinerPath);
230
+
231
+ // Check for FunASR Nano model (encoder_adaptor, llm, embedding, tokenizer directory)
232
+ // Note: funasrTokenizer is already found by findFunAsrTokenizer() above
233
+ bool hasFunAsrEncoderAdaptor = fileExists(funasrEncoderAdaptor) || fileExists(funasrEncoderAdaptorInt8);
234
+ bool hasFunAsrLLM = fileExists(funasrLLM) || fileExists(funasrLLMInt8);
235
+ bool hasFunAsrEmbedding = fileExists(funasrEmbedding) || fileExists(funasrEmbeddingInt8);
236
+ bool hasFunAsrTokenizer = !funasrTokenizer.empty() && fileExists(funasrTokenizer + "/vocab.json");
237
+ bool hasFunAsrNano = hasFunAsrEncoderAdaptor && hasFunAsrLLM && hasFunAsrEmbedding && hasFunAsrTokenizer;
238
+
239
+ // Check if directory name suggests NeMo CTC model (contains "nemo", "parakeet")
240
+ bool isLikelyNemoCtc = modelDir.find("nemo") != std::string::npos ||
241
+ modelDir.find("parakeet") != std::string::npos;
242
+
243
+ // Check if directory name suggests WeNet CTC model (contains "wenet")
244
+ bool isLikelyWenetCtc = modelDir.find("wenet") != std::string::npos;
245
+
246
+ // Check if directory name suggests SenseVoice model (contains "sense" or "sensevoice")
247
+ bool isLikelySenseVoice = modelDir.find("sense") != std::string::npos ||
248
+ modelDir.find("sensevoice") != std::string::npos;
249
+
250
+ // Check if directory name suggests FunASR Nano model (contains "funasr" or "funasr-nano")
251
+ bool isLikelyFunAsrNano = modelDir.find("funasr") != std::string::npos ||
252
+ modelDir.find("funasr-nano") != std::string::npos;
253
+
254
+ // Check if directory name suggests Whisper model
255
+ bool isLikelyWhisper = modelDir.find("whisper") != std::string::npos;
256
+
257
+ bool modelConfigured = false;
258
+
259
+ // Use explicit model type if provided
260
+ if (modelType.has_value()) {
261
+ std::string type = modelType.value();
262
+ if (type == "transducer" && hasTransducer) {
263
+ LOGI("Using explicit Transducer model type");
264
+ config.model_config.transducer.encoder = encoderPath;
265
+ config.model_config.transducer.decoder = decoderPath;
266
+ config.model_config.transducer.joiner = joinerPath;
267
+ modelConfigured = true;
268
+ } else if (type == "paraformer" && !paraformerModelPath.empty()) {
269
+ LOGI("Using explicit Paraformer model type: %s", paraformerModelPath.c_str());
270
+ config.model_config.paraformer.model = paraformerModelPath;
271
+ modelConfigured = true;
272
+ } else if (type == "nemo_ctc" && !ctcModelPath.empty()) {
273
+ LOGI("Using explicit NeMo CTC model type: %s", ctcModelPath.c_str());
274
+ config.model_config.nemo_ctc.model = ctcModelPath;
275
+ modelConfigured = true;
276
+ } else if (type == "wenet_ctc" && !ctcModelPath.empty()) {
277
+ LOGI("Using explicit WeNet CTC model type: %s", ctcModelPath.c_str());
278
+ config.model_config.wenet_ctc.model = ctcModelPath;
279
+ modelConfigured = true;
280
+ } else if (type == "sense_voice" && !ctcModelPath.empty()) {
281
+ LOGI("Using explicit SenseVoice model type: %s", ctcModelPath.c_str());
282
+ config.model_config.sense_voice.model = ctcModelPath;
283
+ config.model_config.sense_voice.language = "auto"; // Default to auto language detection
284
+ config.model_config.sense_voice.use_itn = false; // Default to no ITN
285
+ modelConfigured = true;
286
+ } else if (type == "funasr_nano" && hasFunAsrNano) {
287
+ LOGI("Using explicit FunASR Nano model type");
288
+ // FunASR Nano uses encoder_adaptor, llm, embedding, and tokenizer directory
289
+ std::string encoderAdaptorPath = fileExists(funasrEncoderAdaptorInt8) ? funasrEncoderAdaptorInt8 : funasrEncoderAdaptor;
290
+ std::string llmPath = fileExists(funasrLLMInt8) ? funasrLLMInt8 : funasrLLM;
291
+ std::string embeddingPath = fileExists(funasrEmbeddingInt8) ? funasrEmbeddingInt8 : funasrEmbedding;
292
+ config.model_config.funasr_nano.encoder_adaptor = encoderAdaptorPath;
293
+ config.model_config.funasr_nano.llm = llmPath;
294
+ config.model_config.funasr_nano.embedding = embeddingPath;
295
+ config.model_config.funasr_nano.tokenizer = funasrTokenizer;
296
+ // Use default values for prompts and generation parameters
297
+ tokensRequired = false; // FunASR Nano doesn't use tokens.txt
298
+ modelConfigured = true;
299
+ } else if (type == "whisper" && hasWhisper) {
300
+ LOGI("Using explicit Whisper model type");
301
+ // Whisper uses encoder and decoder, prefer int8 if available
302
+ std::string whisperEncoder = fileExists(encoderPathInt8) ? encoderPathInt8 : encoderPath;
303
+ std::string whisperDecoder = fileExists(decoderPathInt8) ? decoderPathInt8 : decoderPath;
304
+ config.model_config.whisper.encoder = whisperEncoder;
305
+ config.model_config.whisper.decoder = whisperDecoder;
306
+ config.model_config.whisper.language = "en"; // Default to English
307
+ config.model_config.whisper.task = "transcribe"; // Default task
308
+ // Whisper requires tokens.txt - set it if it exists
309
+ tokensRequired = true;
310
+ if (fileExists(tokensPath)) {
311
+ config.model_config.tokens = tokensPath;
312
+ LOGI("Using tokens file for Whisper: %s", tokensPath.c_str());
313
+ } else {
314
+ LOGE("Tokens file not found for Whisper model: %s", tokensPath.c_str());
315
+ return false;
316
+ }
317
+ modelConfigured = true;
318
+ } else {
319
+ LOGE("Explicit model type '%s' specified but required files not found", type.c_str());
320
+ return false;
321
+ }
322
+ }
323
+
324
+ // Auto-detect if no explicit type or auto was specified
325
+ if (!modelConfigured) {
326
+ if (hasTransducer) {
327
+ // Zipformer/Transducer model (has encoder, decoder, AND joiner)
328
+ LOGI("Auto-detected Transducer model: encoder=%s, decoder=%s, joiner=%s",
329
+ encoderPath.c_str(), decoderPath.c_str(), joinerPath.c_str());
330
+ config.model_config.transducer.encoder = encoderPath;
331
+ config.model_config.transducer.decoder = decoderPath;
332
+ config.model_config.transducer.joiner = joinerPath;
333
+ modelConfigured = true;
334
+ } else if (hasFunAsrNano && isLikelyFunAsrNano) {
335
+ // FunASR Nano model (has encoder_adaptor, llm, embedding, and tokenizer)
336
+ std::string encoderAdaptorPath = fileExists(funasrEncoderAdaptorInt8) ? funasrEncoderAdaptorInt8 : funasrEncoderAdaptor;
337
+ std::string llmPath = fileExists(funasrLLMInt8) ? funasrLLMInt8 : funasrLLM;
338
+ std::string embeddingPath = fileExists(funasrEmbeddingInt8) ? funasrEmbeddingInt8 : funasrEmbedding;
339
+ LOGI("Auto-detected FunASR Nano model: encoder_adaptor=%s, llm=%s, embedding=%s, tokenizer=%s",
340
+ encoderAdaptorPath.c_str(), llmPath.c_str(), embeddingPath.c_str(), funasrTokenizer.c_str());
341
+ config.model_config.funasr_nano.encoder_adaptor = encoderAdaptorPath;
342
+ config.model_config.funasr_nano.llm = llmPath;
343
+ config.model_config.funasr_nano.embedding = embeddingPath;
344
+ config.model_config.funasr_nano.tokenizer = funasrTokenizer;
345
+ tokensRequired = false; // FunASR Nano doesn't use tokens.txt
346
+ modelConfigured = true;
347
+ } else if (hasWhisper && isLikelyWhisper) {
348
+ // Whisper model (encoder + decoder, but no joiner, and directory name suggests Whisper)
349
+ std::string whisperEncoder = fileExists(encoderPathInt8) ? encoderPathInt8 : encoderPath;
350
+ std::string whisperDecoder = fileExists(decoderPathInt8) ? decoderPathInt8 : decoderPath;
351
+ LOGI("Auto-detected Whisper model: encoder=%s, decoder=%s",
352
+ whisperEncoder.c_str(), whisperDecoder.c_str());
353
+ config.model_config.whisper.encoder = whisperEncoder;
354
+ config.model_config.whisper.decoder = whisperDecoder;
355
+ config.model_config.whisper.language = "en"; // Default to English
356
+ config.model_config.whisper.task = "transcribe"; // Default task
357
+ // Whisper requires tokens.txt - set it if it exists
358
+ tokensRequired = true; // Whisper requires tokens.txt
359
+ if (fileExists(tokensPath)) {
360
+ config.model_config.tokens = tokensPath;
361
+ LOGI("Using tokens file for Whisper: %s", tokensPath.c_str());
362
+ } else {
363
+ LOGE("Tokens file not found for Whisper model: %s", tokensPath.c_str());
364
+ return false;
365
+ }
366
+ modelConfigured = true;
367
+ } else if (!ctcModelPath.empty() && isLikelySenseVoice) {
368
+ // SenseVoice model (model.onnx exists and directory name suggests SenseVoice)
369
+ LOGI("Auto-detected SenseVoice model: %s (detected by directory name)", ctcModelPath.c_str());
370
+ config.model_config.sense_voice.model = ctcModelPath;
371
+ config.model_config.sense_voice.language = "auto"; // Default to auto language detection
372
+ config.model_config.sense_voice.use_itn = false; // Default to no ITN
373
+ modelConfigured = true;
374
+ } else if (!ctcModelPath.empty() && isLikelyWenetCtc) {
375
+ // WeNet CTC model (model.onnx exists and directory name suggests WeNet)
376
+ LOGI("Auto-detected WeNet CTC model: %s (detected by directory name)", ctcModelPath.c_str());
377
+ config.model_config.wenet_ctc.model = ctcModelPath;
378
+ modelConfigured = true;
379
+ } else if (!ctcModelPath.empty() && isLikelyNemoCtc) {
380
+ // NeMo CTC model (model.onnx exists and directory name suggests NeMo CTC)
381
+ LOGI("Auto-detected NeMo CTC model: %s (detected by directory name)", ctcModelPath.c_str());
382
+ config.model_config.nemo_ctc.model = ctcModelPath;
383
+ modelConfigured = true;
384
+ } else if (!paraformerModelPath.empty()) {
385
+ // Paraformer model (has model.onnx, and directory name doesn't suggest CTC)
386
+ LOGI("Auto-detected Paraformer model: %s", paraformerModelPath.c_str());
387
+ config.model_config.paraformer.model = paraformerModelPath;
388
+ modelConfigured = true;
389
+ } else if (!ctcModelPath.empty() && isLikelyWenetCtc) {
390
+ // Fallback: WeNet CTC model (model.onnx exists, directory name suggests WeNet)
391
+ LOGI("Auto-detected WeNet CTC model: %s (fallback detection)", ctcModelPath.c_str());
392
+ config.model_config.wenet_ctc.model = ctcModelPath;
393
+ modelConfigured = true;
394
+ } else {
395
+ // Fallback: Set all found files and let sherpa-onnx detect the model type from metadata
396
+ // This increases the chance of success for unknown model types
397
+ LOGI("No specific model type detected. Setting all found files and letting sherpa-onnx auto-detect from metadata");
398
+
399
+ bool anyFileSet = false;
400
+
401
+ // Set transducer files if present
402
+ if (hasTransducer) {
403
+ config.model_config.transducer.encoder = encoderPath;
404
+ config.model_config.transducer.decoder = decoderPath;
405
+ config.model_config.transducer.joiner = joinerPath;
406
+ anyFileSet = true;
407
+ LOGI("Set transducer files: encoder=%s, decoder=%s, joiner=%s",
408
+ encoderPath.c_str(), decoderPath.c_str(), joinerPath.c_str());
409
+ }
410
+
411
+ // Set Whisper files if present (encoder + decoder, no joiner)
412
+ if (hasWhisper) {
413
+ std::string whisperEncoder = fileExists(encoderPathInt8) ? encoderPathInt8 : encoderPath;
414
+ std::string whisperDecoder = fileExists(decoderPathInt8) ? decoderPathInt8 : decoderPath;
415
+ config.model_config.whisper.encoder = whisperEncoder;
416
+ config.model_config.whisper.decoder = whisperDecoder;
417
+ config.model_config.whisper.language = "en"; // Default
418
+ config.model_config.whisper.task = "transcribe"; // Default
419
+ anyFileSet = true;
420
+ LOGI("Set Whisper files: encoder=%s, decoder=%s",
421
+ whisperEncoder.c_str(), whisperDecoder.c_str());
422
+ }
423
+
424
+ // Set Paraformer model if present
425
+ if (!paraformerModelPath.empty()) {
426
+ config.model_config.paraformer.model = paraformerModelPath;
427
+ anyFileSet = true;
428
+ LOGI("Set Paraformer model: %s", paraformerModelPath.c_str());
429
+ }
430
+
431
+ // Set FunASR Nano files if present (re-check tokenizer in fallback mode)
432
+ if (hasFunAsrEncoderAdaptor && hasFunAsrLLM && hasFunAsrEmbedding) {
433
+ // Try to find tokenizer if not already found
434
+ std::string tokenizerPath = funasrTokenizer;
435
+ if (tokenizerPath.empty()) {
436
+ // Re-run tokenizer search in fallback mode
437
+ std::string vocabInMain = modelDir + "/vocab.json";
438
+ if (fileExists(vocabInMain)) {
439
+ tokenizerPath = modelDir;
440
+ } else {
441
+ // Search for subdirectories with "Qwen3" in name
442
+ try {
443
+ for (const auto& entry : fs::directory_iterator(modelDir)) {
444
+ if (entry.is_directory()) {
445
+ std::string dirName = entry.path().filename().string();
446
+ std::string dirNameLower = dirName;
447
+ std::transform(dirNameLower.begin(), dirNameLower.end(), dirNameLower.begin(), ::tolower);
448
+ if (dirNameLower.find("qwen3") != std::string::npos) {
449
+ std::string vocabPath = entry.path().string() + "/vocab.json";
450
+ if (fileExists(vocabPath)) {
451
+ tokenizerPath = entry.path().string();
452
+ break;
453
+ }
454
+ }
455
+ }
456
+ }
457
+ } catch (const std::exception& e) {
458
+ LOGE("Error searching for FunASR tokenizer in fallback: %s", e.what());
459
+ }
460
+ }
461
+ }
462
+
463
+ if (!tokenizerPath.empty() && fileExists(tokenizerPath + "/vocab.json")) {
464
+ std::string encoderAdaptorPath = fileExists(funasrEncoderAdaptorInt8) ? funasrEncoderAdaptorInt8 : funasrEncoderAdaptor;
465
+ std::string llmPath = fileExists(funasrLLMInt8) ? funasrLLMInt8 : funasrLLM;
466
+ std::string embeddingPath = fileExists(funasrEmbeddingInt8) ? funasrEmbeddingInt8 : funasrEmbedding;
467
+ config.model_config.funasr_nano.encoder_adaptor = encoderAdaptorPath;
468
+ config.model_config.funasr_nano.llm = llmPath;
469
+ config.model_config.funasr_nano.embedding = embeddingPath;
470
+ config.model_config.funasr_nano.tokenizer = tokenizerPath;
471
+ anyFileSet = true;
472
+ LOGI("Set FunASR Nano files (fallback): encoder_adaptor=%s, llm=%s, embedding=%s, tokenizer=%s",
473
+ encoderAdaptorPath.c_str(), llmPath.c_str(), embeddingPath.c_str(), tokenizerPath.c_str());
474
+ }
475
+ }
476
+
477
+ // Set CTC models if present (try all CTC types)
478
+ if (!ctcModelPath.empty()) {
479
+ // Set all CTC model types - sherpa-onnx will use the correct one based on metadata
480
+ config.model_config.nemo_ctc.model = ctcModelPath;
481
+ config.model_config.wenet_ctc.model = ctcModelPath;
482
+ // Also set SenseVoice if directory name suggests it
483
+ if (isLikelySenseVoice) {
484
+ config.model_config.sense_voice.model = ctcModelPath;
485
+ config.model_config.sense_voice.language = "auto";
486
+ config.model_config.sense_voice.use_itn = false;
487
+ LOGI("Set SenseVoice model: %s", ctcModelPath.c_str());
488
+ }
489
+ // Note: We could also set tdnn, zipformer_ctc, telespeech_ctc here
490
+ // but those are less common, so we'll let sherpa-onnx handle them
491
+ anyFileSet = true;
492
+ LOGI("Set CTC model files: %s (will be detected as NeMo CTC, WeNet CTC, SenseVoice, or other CTC type from metadata)",
493
+ ctcModelPath.c_str());
494
+ }
495
+
496
+ if (anyFileSet) {
497
+ modelConfigured = true;
498
+ LOGI("Fallback: All found files set. sherpa-onnx will detect model type from metadata.");
499
+ }
500
+ }
501
+ }
502
+
503
+ // Set tokens if required or if available (for fallback mode)
504
+ if (tokensRequired) {
505
+ if (!fileExists(tokensPath)) {
506
+ LOGE("Tokens file not found: %s", tokensPath.c_str());
507
+ return false;
508
+ }
509
+ config.model_config.tokens = tokensPath;
510
+ LOGI("Using tokens file: %s", tokensPath.c_str());
511
+ } else if (modelConfigured && fileExists(tokensPath)) {
512
+ // In fallback mode, set tokens.txt if available (many models need it)
513
+ config.model_config.tokens = tokensPath;
514
+ LOGI("Using tokens file (fallback mode): %s", tokensPath.c_str());
515
+ }
516
+
517
+ if (!modelConfigured) {
518
+ LOGE("No valid model files found in directory: %s", modelDir.c_str());
519
+ LOGE("Checked paths:");
520
+ LOGE(" Paraformer (int8): %s (exists: %s)", paraformerPathInt8.c_str(), fileExists(paraformerPathInt8) ? "yes" : "no");
521
+ LOGE(" Paraformer: %s (exists: %s)", paraformerPath.c_str(), fileExists(paraformerPath) ? "yes" : "no");
522
+ LOGE(" CTC (int8): %s (exists: %s)", ctcPathInt8.c_str(), fileExists(ctcPathInt8) ? "yes" : "no");
523
+ LOGE(" CTC: %s (exists: %s)", ctcPath.c_str(), fileExists(ctcPath) ? "yes" : "no");
524
+ LOGE(" Encoder: %s (exists: %s)", encoderPath.c_str(), fileExists(encoderPath) ? "yes" : "no");
525
+ LOGE(" Encoder (int8): %s (exists: %s)", encoderPathInt8.c_str(), fileExists(encoderPathInt8) ? "yes" : "no");
526
+ LOGE(" Decoder: %s (exists: %s)", decoderPath.c_str(), fileExists(decoderPath) ? "yes" : "no");
527
+ LOGE(" Decoder (int8): %s (exists: %s)", decoderPathInt8.c_str(), fileExists(decoderPathInt8) ? "yes" : "no");
528
+ LOGE(" Joiner: %s (exists: %s)", joinerPath.c_str(), fileExists(joinerPath) ? "yes" : "no");
529
+ LOGE("Expected transducer model (encoder.onnx, decoder.onnx, joiner.onnx), whisper model (encoder.onnx, decoder.onnx), paraformer model (model.onnx or model.int8.onnx), NeMo CTC model (model.onnx or model.int8.onnx), WeNet CTC model (model.onnx or model.int8.onnx), SenseVoice model (model.onnx or model.int8.onnx), or FunASR Nano model (encoder_adaptor.onnx, llm.onnx, embedding.onnx, tokenizer directory)");
530
+ return false;
531
+ }
532
+
533
+ // Set common configuration
534
+ config.decoding_method = "greedy_search";
535
+ config.model_config.num_threads = 4;
536
+ config.model_config.provider = "cpu";
537
+
538
+ // Create recognizer
539
+ // Log configuration details
540
+ bool isWhisperModel = !config.model_config.whisper.encoder.empty() && !config.model_config.whisper.decoder.empty();
541
+ if (isWhisperModel) {
542
+ std::string tokensInfo = config.model_config.tokens.empty() ? "none" : config.model_config.tokens;
543
+ LOGI("Creating OfflineRecognizer with Whisper config: encoder=%s, decoder=%s, language=%s, task=%s, tokens=%s, num_threads=%d, provider=%s",
544
+ config.model_config.whisper.encoder.c_str(),
545
+ config.model_config.whisper.decoder.c_str(),
546
+ config.model_config.whisper.language.c_str(),
547
+ config.model_config.whisper.task.c_str(),
548
+ tokensInfo.c_str(),
549
+ config.model_config.num_threads,
550
+ config.model_config.provider.c_str());
551
+ } else {
552
+ LOGI("Creating OfflineRecognizer with config: tokens=%s, num_threads=%d, provider=%s",
553
+ config.model_config.tokens.c_str(), config.model_config.num_threads, config.model_config.provider.c_str());
554
+ }
555
+ try {
556
+ auto recognizer = sherpa_onnx::cxx::OfflineRecognizer::Create(config);
557
+ // Check if recognizer is valid by checking internal pointer
558
+ if (recognizer.Get() == nullptr) {
559
+ LOGE("Failed to create OfflineRecognizer: Create returned invalid object (nullptr)");
560
+ return false;
561
+ }
562
+ pImpl->recognizer = std::move(recognizer);
563
+ LOGI("OfflineRecognizer created successfully");
564
+ } catch (const std::exception& e) {
565
+ LOGE("Failed to create OfflineRecognizer: %s", e.what());
566
+ return false;
567
+ }
568
+
569
+ pImpl->modelDir = modelDir;
570
+ pImpl->initialized = true;
571
+ return true;
572
+ } catch (const std::exception& e) {
573
+ LOGE("Exception during initialization: %s", e.what());
574
+ return false;
575
+ } catch (...) {
576
+ LOGE("Unknown exception during initialization");
577
+ return false;
578
+ }
579
+ }
580
+
581
+ std::string SherpaOnnxWrapper::transcribeFile(const std::string& filePath) {
582
+ if (!pImpl->initialized || !pImpl->recognizer.has_value()) {
583
+ LOGE("Not initialized. Call initialize() first.");
584
+ return "";
585
+ }
586
+
587
+ try {
588
+ // Helper function to check if file exists
589
+ auto fileExists = [](const std::string& path) -> bool {
590
+ #if __cplusplus >= 201703L && __has_include(<filesystem>)
591
+ return std::filesystem::exists(path);
592
+ #elif __has_include(<experimental/filesystem>)
593
+ return std::experimental::filesystem::exists(path);
594
+ #else
595
+ struct stat buffer;
596
+ return (stat(path.c_str(), &buffer) == 0);
597
+ #endif
598
+ };
599
+
600
+ // Check if file exists
601
+ if (!fileExists(filePath)) {
602
+ LOGE("Audio file does not exist: %s", filePath.c_str());
603
+ return "";
604
+ }
605
+
606
+ // Read audio file using cxx-api
607
+ sherpa_onnx::cxx::Wave wave = sherpa_onnx::cxx::ReadWave(filePath);
608
+
609
+ if (wave.samples.empty()) {
610
+ LOGE("Failed to read wave file or file is empty: %s", filePath.c_str());
611
+ return "";
612
+ }
613
+
614
+ // Create a stream
615
+ auto stream = pImpl->recognizer.value().CreateStream();
616
+
617
+ // Feed audio data to the stream (all samples at once for offline recognition)
618
+ stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), wave.samples.size());
619
+
620
+ // Decode the stream
621
+ pImpl->recognizer.value().Decode(&stream);
622
+
623
+ // Get result
624
+ auto result = pImpl->recognizer.value().GetResult(&stream);
625
+
626
+ return result.text;
627
+ } catch (const std::exception& e) {
628
+ LOGE("Exception during transcription: %s", e.what());
629
+ return "";
630
+ } catch (...) {
631
+ LOGE("Unknown exception during transcription");
632
+ return "";
633
+ }
634
+ }
635
+
636
+ bool SherpaOnnxWrapper::isInitialized() const {
637
+ return pImpl->initialized;
638
+ }
639
+
640
+ void SherpaOnnxWrapper::release() {
641
+ if (pImpl->initialized) {
642
+ // OfflineRecognizer uses RAII - destruction happens automatically when optional is reset
643
+ pImpl->recognizer.reset();
644
+ pImpl->initialized = false;
645
+ pImpl->modelDir.clear();
646
+ }
647
+ }
648
+
649
+ } // namespace sherpaonnx