react-native-sherpa-onnx 0.3.7 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -2
- package/SherpaOnnx.podspec +4 -1
- package/android/prebuilt-download.gradle +23 -23
- package/android/src/main/assets/model_licenses/asr-models-license-status.csv +1 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +23 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +9 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +51 -8
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +31 -4
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +19 -1
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +5 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +7 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +11 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +14 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +110 -35
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxExtractionNotificationHelper.kt +102 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +92 -18
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +22 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +15 -0
- package/ios/Resources/model_licenses/asr-models-license-status.csv +1 -0
- package/ios/SherpaOnnx+STT.mm +13 -1
- package/ios/SherpaOnnx+TTS.mm +1 -0
- package/ios/SherpaOnnx.mm +87 -17
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +5 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +23 -0
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +51 -7
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +36 -4
- package/ios/model_detect/sherpa-onnx-model-detect.h +19 -1
- package/ios/model_detect/sherpa-onnx-validate-stt.mm +11 -0
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +14 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +11 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +30 -2
- package/ios/tts/sherpa-onnx-tts-wrapper.mm +25 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +1 -1
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/download/background-downloader-types.js +2 -0
- package/lib/module/download/background-downloader-types.js.map +1 -0
- package/lib/module/download/downloadTask.js +54 -1
- package/lib/module/download/downloadTask.js.map +1 -1
- package/lib/module/download/index.js +1 -1
- package/lib/module/download/index.js.map +1 -1
- package/lib/module/download/postDownloadProcessing.js +17 -4
- package/lib/module/download/postDownloadProcessing.js.map +1 -1
- package/lib/module/download/registry.js +1 -0
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/extraction/extractTarBz2.js +2 -2
- package/lib/module/extraction/extractTarBz2.js.map +1 -1
- package/lib/module/extraction/extractTarZst.js +2 -2
- package/lib/module/extraction/extractTarZst.js.map +1 -1
- package/lib/module/extraction/index.js +10 -5
- package/lib/module/extraction/index.js.map +1 -1
- package/lib/module/stt/index.js +4 -2
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/streaming.js +2 -1
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/stt/types.js +3 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +5 -3
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/streaming.js +4 -2
- package/lib/module/tts/streaming.js.map +1 -1
- package/lib/module/tts/types.js +4 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +26 -10
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/download/background-downloader-types.d.ts +64 -0
- package/lib/typescript/src/download/background-downloader-types.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadTask.d.ts +10 -0
- package/lib/typescript/src/download/downloadTask.d.ts.map +1 -1
- package/lib/typescript/src/download/index.d.ts +2 -2
- package/lib/typescript/src/download/index.d.ts.map +1 -1
- package/lib/typescript/src/download/postDownloadProcessing.d.ts +9 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts +2 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarZst.d.ts +2 -1
- package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -1
- package/lib/typescript/src/extraction/index.d.ts +1 -1
- package/lib/typescript/src/extraction/index.d.ts.map +1 -1
- package/lib/typescript/src/extraction/types.d.ts +12 -0
- package/lib/typescript/src/extraction/types.d.ts.map +1 -1
- package/lib/typescript/src/stt/index.d.ts +1 -1
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +16 -1
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +1 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +6 -1
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/scripts/ci/update_model_license_csv.sh +16 -16
- package/src/NativeSherpaOnnx.ts +38 -11
- package/src/download/ModelDownloadManager.ts +2 -0
- package/src/download/background-downloader-types.ts +73 -0
- package/src/download/downloadTask.ts +68 -0
- package/src/download/index.ts +2 -0
- package/src/download/postDownloadProcessing.ts +24 -1
- package/src/download/registry.ts +1 -0
- package/src/extraction/extractTarBz2.ts +7 -2
- package/src/extraction/extractTarZst.ts +7 -2
- package/src/extraction/index.ts +29 -6
- package/src/extraction/types.ts +16 -0
- package/src/stt/index.ts +8 -7
- package/src/stt/streaming.ts +7 -1
- package/src/stt/types.ts +18 -0
- package/src/tts/index.ts +10 -7
- package/src/tts/streaming.ts +8 -3
- package/src/tts/types.ts +9 -0
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/lib/module/download/background-downloader.d.js +0 -2
- package/lib/module/download/background-downloader.d.js.map +0 -1
- package/src/download/background-downloader.d.ts +0 -43
package/README.md
CHANGED
|
@@ -78,6 +78,8 @@ If you use the [download manager](docs/download-manager.md) to fetch models at r
|
|
|
78
78
|
|
|
79
79
|
Full step-by-step: [Download manager – Setup (iOS & Android)](docs/download-manager.md#setup-ios--android). Expo users can use the library’s config plugin to apply this automatically.
|
|
80
80
|
|
|
81
|
+
**Android:** Foreground service permissions (Play Console), visible download notifications, and **`POST_NOTIFICATIONS` (API 33+)** are covered in [Download manager – Android: foreground service & notifications](docs/download-manager.md#android-foreground-service--notifications).
|
|
82
|
+
|
|
81
83
|
## Table of contents
|
|
82
84
|
|
|
83
85
|
- [Bundled sherpa-onnx version](#bundled-sherpa-onnx-version)
|
|
@@ -124,8 +126,8 @@ Full step-by-step: [Download manager – Setup (iOS & Android)](docs/download-ma
|
|
|
124
126
|
| Model quantization | ✅ **Supported** | [Model setup](./docs/model-setup.md) | Automatic detection and preference for quantized (int8) models. |
|
|
125
127
|
| Flexible model loading | ✅ **Supported** | [Model setup](./docs/model-setup.md) | Asset models, file system models, or auto-detection. |
|
|
126
128
|
| TypeScript | ✅ **Supported** | — | Full type definitions included. |
|
|
127
|
-
|
|
|
128
|
-
|
|
|
129
|
+
| Speech Enhancement | ❌ Not yet supported | [Enhancement](./docs/enhancement.md) | Scheduled for release 0.4.0 |
|
|
130
|
+
| Speaker Diarization | ❌ Not yet supported | [Diarization](./docs/diarization.md) | Scheduled for release 0.5.0 |
|
|
129
131
|
| Source Separation | ❌ Not yet supported | [Separation](./docs/separation.md) | Scheduled for release 0.6.0 |
|
|
130
132
|
| VAD (Voice Activity Detection) | ❌ Not yet supported | [VAD](./docs/vad.md) | Scheduled for release 0.7.0 |
|
|
131
133
|
|
|
@@ -146,6 +148,7 @@ Full step-by-step: [Download manager – Setup (iOS & Android)](docs/download-ma
|
|
|
146
148
|
|
|
147
149
|
| Model Type | `modelType` Value | Description | Download Links |
|
|
148
150
|
| ------------------------ | ----------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ |
|
|
151
|
+
| **Auto Detect** | `'auto'` | Automatically detects model layout/type from files in the model folder and picks the best supported STT type. | n/a |
|
|
149
152
|
| **Zipformer/Transducer** | `'transducer'` | Encoder–decoder–joiner (e.g. icefall). Good balance of speed and accuracy. Folder name should contain **zipformer** or **transducer** for auto-detection. | [Download](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html) |
|
|
150
153
|
| **LSTM Transducer** | `'transducer'` | Same layout as Zipformer (encoder–decoder–joiner). LSTM-based streaming ASR; detected as transducer. Folder name may contain **lstm**. | [Download](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/lstm-transducer-models.html) |
|
|
151
154
|
| **Paraformer** | `'paraformer'` | Single-model non-autoregressive ASR; fast and accurate. Detected by `model.onnx`; no folder token required. | [Download](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html) |
|
|
@@ -170,12 +173,14 @@ For **real-time (streaming) recognition** from a microphone or audio stream, use
|
|
|
170
173
|
|
|
171
174
|
| Model Type | `modelType` Value | Description | Download Links |
|
|
172
175
|
| ---------------- | ----------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- |
|
|
176
|
+
| **Auto Detect** | `'auto'` | Automatically detects the TTS model layout from files in the model folder and selects the matching supported type. | n/a |
|
|
173
177
|
| **VITS** | `'vits'` | Fast, high-quality TTS (Piper, Coqui, MeloTTS, MMS). Folder name should contain **vits** if used with other voice models. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
|
|
174
178
|
| **Matcha** | `'matcha'` | High-quality acoustic model + vocoder. Detected by acoustic_model + vocoder; no folder token required. | [Download](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html) |
|
|
175
179
|
| **Kokoro** | `'kokoro'` | Multi-speaker, multi-language. Folder name should contain **kokoro** (not kitten) for auto-detection. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
|
|
176
180
|
| **KittenTTS** | `'kitten'` | Lightweight, multi-speaker. Folder name should contain **kitten** (not kokoro) for auto-detection. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
|
|
177
181
|
| **Zipvoice** | `'zipvoice'` | Standard TTS with **`sid`**. **Voice cloning** (reference audio + `referenceText`): batch via **`generateSpeech`** only—streaming TTS does not support reference audio for Zipvoice. Default **`numSteps`** when omitted is **5** on **Android and iOS** (matches sherpa-onnx `GenerationConfig` / Kotlin helper). Cloning is **supported on Android & iOS**. Encoder + decoder + vocoder. | [Download](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/zipvoice.html) |
|
|
178
182
|
| **Pocket** | `'pocket'` | Flow-matching TTS. **Voice cloning** on **Android:** batch and streaming TTS. **iOS:** cloning is experimental. Detected by lm_flow, lm_main, text_conditioner, vocab/token_scores. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
|
|
183
|
+
| **Supertonic** | `'supertonic'` | Lightning-fast, on-device text-to-speech system designed for extreme performance with minimal computational overhead. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
|
|
179
184
|
|
|
180
185
|
For **streaming TTS** (incremental generation, low latency), use `createStreamingTTS()` with supported model types. See [Streaming Text-to-Speech](./docs/tts-streaming.md).
|
|
181
186
|
|
package/SherpaOnnx.podspec
CHANGED
|
@@ -140,7 +140,10 @@ Pod::Spec.new do |s|
|
|
|
140
140
|
s.libraries = "c++", "z", "iconv", "bz2"
|
|
141
141
|
|
|
142
142
|
# Per-release-model license metadata (synced from CI; same CSV as android/src/main/assets/model_licenses/).
|
|
143
|
-
|
|
143
|
+
# Use resource_bundles so assets are packaged reliably across CocoaPods integration modes.
|
|
144
|
+
s.resource_bundles = {
|
|
145
|
+
"SherpaOnnxResources" => ["ios/Resources/model_licenses/*.csv"]
|
|
146
|
+
}
|
|
144
147
|
|
|
145
148
|
install_modules_dependencies(s)
|
|
146
149
|
end
|
|
@@ -222,14 +222,14 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
222
222
|
sherpaVersionFile.text = currentSherpaVersion
|
|
223
223
|
sherpaResolved = true
|
|
224
224
|
println "[sherpa-onnx] jniLibs (*.so per ABI) .............. MAVEN_AAR ${aar.name}"
|
|
225
|
-
println "[sherpa-onnx] install: jni/<abi>/*.so
|
|
226
|
-
println "[sherpa-onnx] C headers (sherpa-onnx) ............. ${copiedHeaders ? 'MAVEN_AAR c-api/**
|
|
225
|
+
println "[sherpa-onnx] install: jni/<abi>/*.so --> ${jniLibsHuman}/<abi>/"
|
|
226
|
+
println "[sherpa-onnx] C headers (sherpa-onnx) ............. ${copiedHeaders ? 'MAVEN_AAR c-api/** --> ' + includeSherpaDir : 'unchanged (no c-api/ in AAR; existing tree kept)'}"
|
|
227
227
|
println "[sherpa-onnx] version stamp ...................... written ${sherpaVersionFile.name}=${currentSherpaVersion}"
|
|
228
228
|
} else {
|
|
229
|
-
println "[sherpa-onnx] MAVEN_AAR: sherpaOnnxAar empty
|
|
229
|
+
println "[sherpa-onnx] MAVEN_AAR: sherpaOnnxAar empty --> trying GITHUB_RELEASE"
|
|
230
230
|
}
|
|
231
231
|
} catch (Exception e) {
|
|
232
|
-
println "[sherpa-onnx] MAVEN_AAR failed: ${e.message}
|
|
232
|
+
println "[sherpa-onnx] MAVEN_AAR failed: ${e.message} --> trying GITHUB_RELEASE"
|
|
233
233
|
}
|
|
234
234
|
}
|
|
235
235
|
} else {
|
|
@@ -290,14 +290,14 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
290
290
|
ffmpegVersionFile.text = currentFfmpegVersion
|
|
291
291
|
ffmpegResolved = true
|
|
292
292
|
println "[FFmpeg] jniLibs .................................. MAVEN_AAR ${aar.name}"
|
|
293
|
-
println "[FFmpeg] install: jni/<abi>/*.so
|
|
294
|
-
println "[FFmpeg] C headers ................................ ${copiedHdr ? 'MAVEN_AAR include/**
|
|
293
|
+
println "[FFmpeg] install: jni/<abi>/*.so --> ${jniLibsHuman}/<abi>/"
|
|
294
|
+
println "[FFmpeg] C headers ................................ ${copiedHdr ? 'MAVEN_AAR include/** --> ' + ffmpegIncludeDir : 'unchanged (no include/ in AAR)'}"
|
|
295
295
|
println "[FFmpeg] version stamp ............................ written ${ffmpegVersionFile.name}=${currentFfmpegVersion}"
|
|
296
296
|
} else {
|
|
297
|
-
println "[FFmpeg] MAVEN_AAR: ffmpegAar empty
|
|
297
|
+
println "[FFmpeg] MAVEN_AAR: ffmpegAar empty --> trying GITHUB_RELEASE"
|
|
298
298
|
}
|
|
299
299
|
} catch (Exception e) {
|
|
300
|
-
println "[FFmpeg] MAVEN_AAR failed: ${e.message}
|
|
300
|
+
println "[FFmpeg] MAVEN_AAR failed: ${e.message} --> trying GITHUB_RELEASE"
|
|
301
301
|
}
|
|
302
302
|
}
|
|
303
303
|
} else {
|
|
@@ -358,14 +358,14 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
358
358
|
libarchiveVersionFile.text = currentLibarchiveVersion
|
|
359
359
|
libarchiveResolved = true
|
|
360
360
|
println "[libarchive] jniLibs .............................. MAVEN_AAR ${aar.name}"
|
|
361
|
-
println "[libarchive] install: jni/<abi>/*.so
|
|
362
|
-
println "[libarchive] C headers ............................ ${copiedHdr ? 'MAVEN_AAR include/**
|
|
361
|
+
println "[libarchive] install: jni/<abi>/*.so --> ${jniLibsHuman}/<abi>/"
|
|
362
|
+
println "[libarchive] C headers ............................ ${copiedHdr ? 'MAVEN_AAR include/** --> ' + libarchiveIncludeDir : 'unchanged (no include/ in AAR)'}"
|
|
363
363
|
println "[libarchive] version stamp ........................ written ${libarchiveVersionFile.name}=${currentLibarchiveVersion}"
|
|
364
364
|
} else {
|
|
365
|
-
println "[libarchive] MAVEN_AAR: libarchiveAar empty
|
|
365
|
+
println "[libarchive] MAVEN_AAR: libarchiveAar empty --> trying GITHUB_RELEASE"
|
|
366
366
|
}
|
|
367
367
|
} catch (Exception e) {
|
|
368
|
-
println "[libarchive] MAVEN_AAR failed: ${e.message}
|
|
368
|
+
println "[libarchive] MAVEN_AAR failed: ${e.message} --> trying GITHUB_RELEASE"
|
|
369
369
|
}
|
|
370
370
|
}
|
|
371
371
|
} else {
|
|
@@ -415,7 +415,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
415
415
|
}
|
|
416
416
|
ortJniResolved = true
|
|
417
417
|
println "[onnxruntime] libonnxruntime4j_jni.so .......... MAVEN_AAR ${aar.name}"
|
|
418
|
-
println "[onnxruntime] install: per ABI
|
|
418
|
+
println "[onnxruntime] install: per ABI --> ${jniLibsHuman}/<abi>/ (only JNI bridge; libonnxruntime.so from sherpa prebuilts)"
|
|
419
419
|
} else {
|
|
420
420
|
println "[onnxruntime] MAVEN_AAR: onnxruntimeAar empty — libonnxruntime4j_jni.so still missing"
|
|
421
421
|
}
|
|
@@ -436,7 +436,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
436
436
|
def needLibarchive = !sherpaOnnxDisableLibarchive && !libarchiveResolved
|
|
437
437
|
def needSherpa = !sherpaResolved
|
|
438
438
|
if (needFfmpeg || needLibarchive || needSherpa) {
|
|
439
|
-
println "[prebuilt] GITHUB_RELEASE: skipped (no repo). Set -PprebuiltGitHubRepo=owner/repo or git remote origin
|
|
439
|
+
println "[prebuilt] GITHUB_RELEASE: skipped (no repo). Set -PprebuiltGitHubRepo=owner/repo or git remote origin --> github.com"
|
|
440
440
|
println "[prebuilt] still need: sherpa=${needSherpa}, ffmpeg=${needFfmpeg}, libarchive=${needLibarchive}"
|
|
441
441
|
def diag = [
|
|
442
442
|
"prebuiltGitHubRepo=${project.findProperty('prebuiltGitHubRepo') ?: '(not set)'}",
|
|
@@ -492,7 +492,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
492
492
|
ffmpegVersionFile.text = currentFfmpegVersion
|
|
493
493
|
println "[FFmpeg] jniLibs + C headers ...................... GITHUB_RELEASE tag=${tag}"
|
|
494
494
|
println "[FFmpeg] url: ${url}"
|
|
495
|
-
println "[FFmpeg] install: <abi>/*.so
|
|
495
|
+
println "[FFmpeg] install: <abi>/*.so --> ${jniLibsHuman}/"
|
|
496
496
|
println "[FFmpeg] version stamp .......................... written ${ffmpegVersionFile.name}=${currentFfmpegVersion}"
|
|
497
497
|
}
|
|
498
498
|
|
|
@@ -522,7 +522,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
522
522
|
libarchiveVersionFile.text = currentLibarchiveVersion
|
|
523
523
|
println "[libarchive] jniLibs + C headers .................. GITHUB_RELEASE tag=${tag}"
|
|
524
524
|
println "[libarchive] url: ${url}"
|
|
525
|
-
println "[libarchive] install: <abi>/*.so
|
|
525
|
+
println "[libarchive] install: <abi>/*.so --> ${jniLibsHuman}/"
|
|
526
526
|
println "[libarchive] version stamp ........................ written ${libarchiveVersionFile.name}=${currentLibarchiveVersion}"
|
|
527
527
|
}
|
|
528
528
|
|
|
@@ -560,8 +560,8 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
|
|
|
560
560
|
sherpaVersionFile.text = currentSherpaVersion
|
|
561
561
|
println "[sherpa-onnx] jniLibs + C headers ................. GITHUB_RELEASE tag=${tag}"
|
|
562
562
|
println "[sherpa-onnx] url: ${url}"
|
|
563
|
-
println "[sherpa-onnx] install: <abi>/*.so
|
|
564
|
-
println "[sherpa-onnx] classes.jar ......................... ${sherpaJavaJar.exists() ? 'GITHUB_RELEASE
|
|
563
|
+
println "[sherpa-onnx] install: <abi>/*.so --> ${jniLibsHuman}/"
|
|
564
|
+
println "[sherpa-onnx] classes.jar ......................... ${sherpaJavaJar.exists() ? 'GITHUB_RELEASE --> ' + sherpaOnnxClassesDir : 'not in zip (use extractSherpaOnnxClasses)'}"
|
|
565
565
|
println "[sherpa-onnx] version stamp ...................... written ${sherpaVersionFile.name}=${currentSherpaVersion}"
|
|
566
566
|
}
|
|
567
567
|
println ""
|
|
@@ -612,7 +612,7 @@ project.afterEvaluate {
|
|
|
612
612
|
project.tasks.findByName('preBuild')?.dependsOn(project.tasks.findByName('checkJniLibs'))
|
|
613
613
|
}
|
|
614
614
|
|
|
615
|
-
// sherpa-onnx classes.jar: resolution order THIRD_PARTY
|
|
615
|
+
// sherpa-onnx classes.jar: resolution order THIRD_PARTY --> MAVEN_AAR --> GITHUB_EXTRACT
|
|
616
616
|
def sherpaLocalJar = file("${project.projectDir.parent}/third_party/sherpa-onnx-prebuilt/android/java/classes.jar")
|
|
617
617
|
def sherpaExtractedJar = file("${project.buildDir}/prebuilt-downloads/sherpa-onnx-extract/java/classes.jar")
|
|
618
618
|
|
|
@@ -627,7 +627,7 @@ project.tasks.register("extractSherpaOnnxClasses") {
|
|
|
627
627
|
copy { from sherpaLocalJar; into sherpaOnnxClassesDir }
|
|
628
628
|
println "[prebuilt] extractSherpaOnnxClasses"
|
|
629
629
|
println "[sherpa-onnx] classes.jar (Kotlin API) .......... THIRD_PARTY"
|
|
630
|
-
println "[sherpa-onnx] ${sherpaLocalJar.absolutePath}
|
|
630
|
+
println "[sherpa-onnx] ${sherpaLocalJar.absolutePath} --> ${sherpaOnnxClassesDir}"
|
|
631
631
|
return
|
|
632
632
|
}
|
|
633
633
|
def aarFiles = project.configurations.sherpaOnnxAar.files
|
|
@@ -640,14 +640,14 @@ project.tasks.register("extractSherpaOnnxClasses") {
|
|
|
640
640
|
}
|
|
641
641
|
println "[prebuilt] extractSherpaOnnxClasses"
|
|
642
642
|
println "[sherpa-onnx] classes.jar (Kotlin API) .......... MAVEN_AAR ${aar.name}"
|
|
643
|
-
println "[sherpa-onnx] classes.jar
|
|
643
|
+
println "[sherpa-onnx] classes.jar --> ${sherpaOnnxClassesDir}"
|
|
644
644
|
return
|
|
645
645
|
}
|
|
646
646
|
if (sherpaExtractedJar.exists()) {
|
|
647
647
|
copy { from sherpaExtractedJar; into sherpaOnnxClassesDir }
|
|
648
648
|
println "[prebuilt] extractSherpaOnnxClasses"
|
|
649
649
|
println "[sherpa-onnx] classes.jar (Kotlin API) .......... GITHUB_EXTRACT"
|
|
650
|
-
println "[sherpa-onnx] ${sherpaExtractedJar.absolutePath}
|
|
650
|
+
println "[sherpa-onnx] ${sherpaExtractedJar.absolutePath} --> ${sherpaOnnxClassesDir}"
|
|
651
651
|
return
|
|
652
652
|
}
|
|
653
653
|
throw new RuntimeException(
|
|
@@ -678,7 +678,7 @@ project.tasks.register("extractOnnxruntimeClasses") {
|
|
|
678
678
|
}
|
|
679
679
|
println "[prebuilt] extractOnnxruntimeClasses"
|
|
680
680
|
println "[onnxruntime] classes.jar (Java API) ........... MAVEN_AAR ${aar.name}"
|
|
681
|
-
println "[onnxruntime] renamed to onnxruntime-classes.jar
|
|
681
|
+
println "[onnxruntime] renamed to onnxruntime-classes.jar --> ${onnxruntimeClassesDir}"
|
|
682
682
|
return
|
|
683
683
|
}
|
|
684
684
|
throw new RuntimeException(
|
|
@@ -397,6 +397,7 @@ sherpa-onnx-rk3576-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high
|
|
|
397
397
|
sherpa-onnx-rk3568-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
|
|
398
398
|
sherpa-onnx-rk3566-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
|
|
399
399
|
sherpa-onnx-rk3562-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
|
|
400
|
+
sherpa-onnx-qwen3-asr-0.6B-int8-2026-03-25.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/Qwen/Qwen3-ASR-0.6B
|
|
400
401
|
sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
|
|
401
402
|
sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
|
|
402
403
|
sherpa-onnx-rk3576-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
|
|
@@ -389,5 +389,28 @@ std::vector<LexiconCandidate> FindLexiconCandidates(
|
|
|
389
389
|
return candidates;
|
|
390
390
|
}
|
|
391
391
|
|
|
392
|
+
bool Qwen3TokenizerDirHasVocabAndMerges(
|
|
393
|
+
const std::vector<FileEntry>& files,
|
|
394
|
+
const std::string& dirRaw
|
|
395
|
+
) {
|
|
396
|
+
std::string dir = dirRaw;
|
|
397
|
+
while (!dir.empty() && (dir.back() == '/' || dir.back() == '\\'))
|
|
398
|
+
dir.pop_back();
|
|
399
|
+
if (dir.empty()) return false;
|
|
400
|
+
bool hasVocab = false;
|
|
401
|
+
bool hasMerges = false;
|
|
402
|
+
const std::string prefix = dir + "/";
|
|
403
|
+
for (const auto& e : files) {
|
|
404
|
+
if (e.path.size() <= prefix.size()) continue;
|
|
405
|
+
if (e.path.compare(0, prefix.size(), prefix) != 0) continue;
|
|
406
|
+
std::string rest = e.path.substr(prefix.size());
|
|
407
|
+
if (rest.find('/') != std::string::npos || rest.find('\\') != std::string::npos) continue;
|
|
408
|
+
if (e.nameLower == "vocab.json") hasVocab = true;
|
|
409
|
+
if (e.nameLower == "merges.txt") hasMerges = true;
|
|
410
|
+
}
|
|
411
|
+
if (hasVocab && hasMerges) return true;
|
|
412
|
+
return FileExists(dir + "/vocab.json") && FileExists(dir + "/merges.txt");
|
|
413
|
+
}
|
|
414
|
+
|
|
392
415
|
} // namespace model_detect
|
|
393
416
|
} // namespace sherpaonnx
|
|
@@ -88,6 +88,15 @@ std::vector<LexiconCandidate> FindLexiconCandidates(
|
|
|
88
88
|
const std::string& rootDir
|
|
89
89
|
);
|
|
90
90
|
|
|
91
|
+
/**
|
|
92
|
+
* True if `dir` contains vocab.json and merges.txt: listed in `files` (fixture / synthetic trees)
|
|
93
|
+
* or present on disk. Used for Qwen3-ASR tokenizer directory detection.
|
|
94
|
+
*/
|
|
95
|
+
bool Qwen3TokenizerDirHasVocabAndMerges(
|
|
96
|
+
const std::vector<FileEntry>& files,
|
|
97
|
+
const std::string& dir
|
|
98
|
+
);
|
|
99
|
+
|
|
91
100
|
} // namespace model_detect
|
|
92
101
|
} // namespace sherpaonnx
|
|
93
102
|
|
|
@@ -61,6 +61,7 @@ static const char* KindToName(SttModelKind k) {
|
|
|
61
61
|
case SttModelKind::kZipformerCtc: return "zipformer_ctc";
|
|
62
62
|
case SttModelKind::kWhisper: return "whisper";
|
|
63
63
|
case SttModelKind::kFunAsrNano: return "funasr_nano";
|
|
64
|
+
case SttModelKind::kQwen3Asr: return "qwen3_asr";
|
|
64
65
|
case SttModelKind::kFireRedAsr: return "fire_red_asr";
|
|
65
66
|
case SttModelKind::kMoonshine: return "moonshine";
|
|
66
67
|
case SttModelKind::kMoonshineV2: return "moonshine_v2";
|
|
@@ -88,6 +89,7 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
|
|
|
88
89
|
if (modelType == "zipformer_ctc" || modelType == "ctc") return SttModelKind::kZipformerCtc;
|
|
89
90
|
if (modelType == "whisper") return SttModelKind::kWhisper;
|
|
90
91
|
if (modelType == "funasr_nano") return SttModelKind::kFunAsrNano;
|
|
92
|
+
if (modelType == "qwen3_asr") return SttModelKind::kQwen3Asr;
|
|
91
93
|
if (modelType == "fire_red_asr") return SttModelKind::kFireRedAsr;
|
|
92
94
|
if (modelType == "moonshine") return SttModelKind::kMoonshine;
|
|
93
95
|
if (modelType == "moonshine_v2") return SttModelKind::kMoonshineV2;
|
|
@@ -126,6 +128,8 @@ static bool CapabilitySupportsKind(
|
|
|
126
128
|
return cap.hasWhisper;
|
|
127
129
|
case SttModelKind::kFunAsrNano:
|
|
128
130
|
return cap.hasFunAsrNano;
|
|
131
|
+
case SttModelKind::kQwen3Asr:
|
|
132
|
+
return cap.hasQwen3Asr;
|
|
129
133
|
case SttModelKind::kFireRedAsr:
|
|
130
134
|
return cap.hasFireRedAsr;
|
|
131
135
|
case SttModelKind::kMoonshine:
|
|
@@ -189,6 +193,8 @@ static std::vector<SttModelKind> GetKindsFromDirName(const std::string& modelDir
|
|
|
189
193
|
add(SttModelKind::kTransducer);
|
|
190
194
|
add(SttModelKind::kZipformerCtc);
|
|
191
195
|
}
|
|
196
|
+
if (lower.find("qwen3-asr") != std::string::npos || lower.find("qwen3_asr") != std::string::npos)
|
|
197
|
+
add(SttModelKind::kQwen3Asr);
|
|
192
198
|
if (lower.find("funasr") != std::string::npos)
|
|
193
199
|
add(SttModelKind::kFunAsrNano);
|
|
194
200
|
if (lower.find("canary") != std::string::npos)
|
|
@@ -249,6 +255,19 @@ static SttCandidatePaths GatherSttCandidatePaths(
|
|
|
249
255
|
p.funasrTokenizerDir = vocabInSubdir.substr(0, lastSlash);
|
|
250
256
|
}
|
|
251
257
|
}
|
|
258
|
+
p.qwen3ConvFrontend = FindOnnxByAnyToken(files, {"conv_frontend"}, preferInt8);
|
|
259
|
+
{
|
|
260
|
+
for (const auto& entry : files) {
|
|
261
|
+
if (entry.nameLower != "tokenizer_config.json") continue;
|
|
262
|
+
size_t slash = entry.path.find_last_of("/\\");
|
|
263
|
+
if (slash == std::string::npos) continue;
|
|
264
|
+
std::string dir = entry.path.substr(0, slash);
|
|
265
|
+
if (Qwen3TokenizerDirHasVocabAndMerges(files, dir)) {
|
|
266
|
+
p.qwen3TokenizerDir = dir;
|
|
267
|
+
break;
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
}
|
|
252
271
|
p.moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
|
|
253
272
|
p.moonshineEncoder = FindOnnxByAnyToken(files, {"encode", "encoder_model"}, preferInt8);
|
|
254
273
|
p.moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
|
|
@@ -258,7 +277,8 @@ static SttCandidatePaths GatherSttCandidatePaths(
|
|
|
258
277
|
static const std::vector<std::string> modelExcludes = {
|
|
259
278
|
"encoder", "decoder", "joiner", "vocoder", "acoustic", "embedding", "llm",
|
|
260
279
|
"encoder_adaptor", "encoder-adaptor", "encoder_model", "decoder_model",
|
|
261
|
-
"merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached"
|
|
280
|
+
"merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached",
|
|
281
|
+
"conv_frontend"
|
|
262
282
|
};
|
|
263
283
|
p.paraformerModel = FindOnnxByAnyToken(files, {"model"}, preferInt8);
|
|
264
284
|
if (!p.paraformerModel.empty()) {
|
|
@@ -302,6 +322,7 @@ static SttPathHints GetSttPathHints(const std::string& modelDir) {
|
|
|
302
322
|
h.isLikelyWenetCtc = lower.find("wenet") != std::string::npos;
|
|
303
323
|
h.isLikelySenseVoice = lower.find("sense") != std::string::npos || lower.find("sensevoice") != std::string::npos;
|
|
304
324
|
h.isLikelyFunAsrNano = lower.find("funasr") != std::string::npos || lower.find("funasr-nano") != std::string::npos;
|
|
325
|
+
h.isLikelyQwen3Asr = lower.find("qwen3-asr") != std::string::npos || lower.find("qwen3_asr") != std::string::npos;
|
|
305
326
|
h.isLikelyZipformer = lower.find("zipformer") != std::string::npos;
|
|
306
327
|
h.isLikelyMoonshine = lower.find("moonshine") != std::string::npos;
|
|
307
328
|
h.isLikelyDolphin = lower.find("dolphin") != std::string::npos;
|
|
@@ -404,7 +425,9 @@ static SttCapabilities ComputeSttCapabilities(const SttCandidatePaths& paths, co
|
|
|
404
425
|
c.hasTransducer = !paths.encoder.empty() && !paths.decoder.empty() && !paths.joiner.empty();
|
|
405
426
|
bool hasWhisperEnc = !paths.encoder.empty();
|
|
406
427
|
bool hasWhisperDec = !paths.decoder.empty();
|
|
407
|
-
|
|
428
|
+
bool hasQwen3Tok = !paths.qwen3TokenizerDir.empty();
|
|
429
|
+
c.hasQwen3Asr = !paths.qwen3ConvFrontend.empty() && hasWhisperEnc && hasWhisperDec && hasQwen3Tok;
|
|
430
|
+
c.hasWhisper = hasWhisperEnc && hasWhisperDec && paths.joiner.empty() && !c.hasQwen3Asr;
|
|
408
431
|
bool hasFunAsrTok = !paths.funasrTokenizerDir.empty();
|
|
409
432
|
c.hasFunAsrNano = !paths.funasrEncoderAdaptor.empty() && !paths.funasrLLM.empty() &&
|
|
410
433
|
!paths.funasrEmbedding.empty() && hasFunAsrTok;
|
|
@@ -446,6 +469,7 @@ static void CollectDetectedModels(
|
|
|
446
469
|
out.push_back({"paraformer", modelDir});
|
|
447
470
|
}
|
|
448
471
|
if (cap.hasWhisper) out.push_back({"whisper", modelDir});
|
|
472
|
+
if (cap.hasQwen3Asr) out.push_back({"qwen3_asr", modelDir});
|
|
449
473
|
if (cap.hasFunAsrNano) out.push_back({"funasr_nano", modelDir});
|
|
450
474
|
if (cap.hasMoonshine) out.push_back({"moonshine", modelDir});
|
|
451
475
|
if (cap.hasMoonshineV2) out.push_back({"moonshine_v2", modelDir});
|
|
@@ -507,6 +531,10 @@ static SttModelKind ResolveSttKind(
|
|
|
507
531
|
outError = "FunASR Nano model requested but required files not found in " + modelDir;
|
|
508
532
|
return SttModelKind::kUnknown;
|
|
509
533
|
}
|
|
534
|
+
if (selected == SttModelKind::kQwen3Asr && !cap.hasQwen3Asr) {
|
|
535
|
+
outError = "Qwen3-ASR model requested but conv_frontend/encoder/decoder/tokenizer not found in " + modelDir;
|
|
536
|
+
return SttModelKind::kUnknown;
|
|
537
|
+
}
|
|
510
538
|
if (selected == SttModelKind::kMoonshine && !cap.hasMoonshine) {
|
|
511
539
|
outError = "Moonshine v1 model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir;
|
|
512
540
|
return SttModelKind::kUnknown;
|
|
@@ -573,7 +601,9 @@ static SttModelKind ResolveSttKind(
|
|
|
573
601
|
if (!paths.paraformerModel.empty()) return SttModelKind::kParaformer;
|
|
574
602
|
if (cap.hasCanary) return SttModelKind::kCanary;
|
|
575
603
|
if (cap.hasFireRedAsr) return SttModelKind::kFireRedAsr;
|
|
604
|
+
if (cap.hasQwen3Asr && hints.isLikelyQwen3Asr) return SttModelKind::kQwen3Asr;
|
|
576
605
|
if (cap.hasWhisper) return SttModelKind::kWhisper;
|
|
606
|
+
if (cap.hasQwen3Asr) return SttModelKind::kQwen3Asr;
|
|
577
607
|
if (cap.hasFunAsrNano) return SttModelKind::kFunAsrNano;
|
|
578
608
|
if (cap.hasMoonshineV2) return SttModelKind::kMoonshineV2;
|
|
579
609
|
if (cap.hasDolphin) return SttModelKind::kDolphin;
|
|
@@ -618,6 +648,12 @@ static void ApplyPathsForSttKind(SttModelKind kind, const SttCandidatePaths& can
|
|
|
618
648
|
resultPaths.funasrEmbedding = candidate.funasrEmbedding;
|
|
619
649
|
resultPaths.funasrTokenizer = candidate.funasrTokenizerDir;
|
|
620
650
|
break;
|
|
651
|
+
case SttModelKind::kQwen3Asr:
|
|
652
|
+
resultPaths.qwen3ConvFrontend = candidate.qwen3ConvFrontend;
|
|
653
|
+
resultPaths.qwen3Encoder = candidate.encoder;
|
|
654
|
+
resultPaths.qwen3Decoder = candidate.decoder;
|
|
655
|
+
resultPaths.qwen3Tokenizer = candidate.qwen3TokenizerDir;
|
|
656
|
+
break;
|
|
621
657
|
case SttModelKind::kMoonshine:
|
|
622
658
|
resultPaths.moonshinePreprocessor = candidate.moonshinePreprocessor;
|
|
623
659
|
resultPaths.moonshineEncoder = candidate.moonshineEncoder;
|
|
@@ -711,13 +747,13 @@ SttDetectResult DetectSttModel(
|
|
|
711
747
|
EmptyOrPath(candidate.encoder), EmptyOrPath(candidate.decoder));
|
|
712
748
|
LOGI("DetectSttModel: funasr encoderAdaptor=%s llm=%s embedding=%s tokenizerDir=%s",
|
|
713
749
|
EmptyOrPath(candidate.funasrEncoderAdaptor), EmptyOrPath(candidate.funasrLLM), EmptyOrPath(candidate.funasrEmbedding), EmptyOrPath(candidate.funasrTokenizerDir));
|
|
714
|
-
LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
|
|
750
|
+
LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasQwen3Asr=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
|
|
715
751
|
(int)cap.hasTransducer, (int)cap.hasWhisper, (int)cap.hasMoonshine, (int)cap.hasMoonshineV2,
|
|
716
|
-
(int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
|
|
752
|
+
(int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasQwen3Asr, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
|
|
717
753
|
(int)cap.hasCanary, (int)cap.hasOmnilingual, (int)cap.hasMedAsr, (int)cap.hasTeleSpeechCtc, (int)cap.hasToneCtc);
|
|
718
|
-
LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
|
|
754
|
+
LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyQwen3Asr=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
|
|
719
755
|
(int)hints.isLikelyNemo, (int)hints.isLikelyTdt, (int)hints.isLikelyWenetCtc, (int)hints.isLikelySenseVoice,
|
|
720
|
-
(int)hints.isLikelyFunAsrNano, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
|
|
756
|
+
(int)hints.isLikelyFunAsrNano, (int)hints.isLikelyQwen3Asr, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
|
|
721
757
|
(int)hints.isLikelyFireRedAsr, (int)hints.isLikelyCanary, (int)hints.isLikelyOmnilingual, (int)hints.isLikelyMedAsr,
|
|
722
758
|
(int)hints.isLikelyTeleSpeech, (int)hints.isLikelyToneCtc, (int)hints.isLikelyParaformer, (int)hints.isLikelyVad, (int)hints.isLikelyTdnn);
|
|
723
759
|
}
|
|
@@ -747,7 +783,8 @@ SttDetectResult DetectSttModel(
|
|
|
747
783
|
}
|
|
748
784
|
|
|
749
785
|
LOGI("DetectSttModel: selected kind=%d (%s)", static_cast<int>(result.selectedKind), KindToName(result.selectedKind));
|
|
750
|
-
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano
|
|
786
|
+
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano &&
|
|
787
|
+
result.selectedKind != SttModelKind::kQwen3Asr);
|
|
751
788
|
ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
|
|
752
789
|
|
|
753
790
|
if (!candidate.tokens.empty() && FileExists(candidate.tokens)) {
|
|
@@ -808,6 +845,11 @@ SttDetectResult DetectSttModel(
|
|
|
808
845
|
EmptyOrPath(result.paths.funasrEncoderAdaptor), EmptyOrPath(result.paths.funasrLLM),
|
|
809
846
|
EmptyOrPath(result.paths.funasrEmbedding), EmptyOrPath(result.paths.funasrTokenizer));
|
|
810
847
|
break;
|
|
848
|
+
case SttModelKind::kQwen3Asr:
|
|
849
|
+
LOGI("DetectSttModel: paths set qwen3_asr conv=%s encoder=%s decoder=%s tokenizer=%s",
|
|
850
|
+
EmptyOrPath(result.paths.qwen3ConvFrontend), EmptyOrPath(result.paths.qwen3Encoder),
|
|
851
|
+
EmptyOrPath(result.paths.qwen3Decoder), EmptyOrPath(result.paths.qwen3Tokenizer));
|
|
852
|
+
break;
|
|
811
853
|
default:
|
|
812
854
|
break;
|
|
813
855
|
}
|
|
@@ -854,7 +896,8 @@ SttDetectResult DetectSttModelFromFileList(
|
|
|
854
896
|
return result;
|
|
855
897
|
}
|
|
856
898
|
|
|
857
|
-
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano
|
|
899
|
+
result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano &&
|
|
900
|
+
result.selectedKind != SttModelKind::kQwen3Asr);
|
|
858
901
|
ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
|
|
859
902
|
|
|
860
903
|
result.paths.tokens = candidate.tokens;
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* sherpa-onnx-model-detect-tts.cpp
|
|
3
3
|
*
|
|
4
4
|
* Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
|
|
5
|
-
* nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
|
|
5
|
+
* nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice, Supertonic.
|
|
6
6
|
*
|
|
7
7
|
* --- Detection pipeline (overview) ---
|
|
8
8
|
*
|
|
@@ -56,6 +56,7 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
|
|
|
56
56
|
if (modelType == "kitten") return TtsModelKind::kKitten;
|
|
57
57
|
if (modelType == "pocket") return TtsModelKind::kPocket;
|
|
58
58
|
if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
|
|
59
|
+
if (modelType == "supertonic") return TtsModelKind::kSupertonic;
|
|
59
60
|
return TtsModelKind::kUnknown;
|
|
60
61
|
}
|
|
61
62
|
|
|
@@ -68,6 +69,7 @@ static bool CapabilitySupportsTtsKind(
|
|
|
68
69
|
bool hasMatcha,
|
|
69
70
|
bool hasPocket,
|
|
70
71
|
bool hasZipvoice,
|
|
72
|
+
bool hasSupertonic,
|
|
71
73
|
bool hasVoicesFile,
|
|
72
74
|
bool hasDataDir
|
|
73
75
|
) {
|
|
@@ -83,6 +85,8 @@ static bool CapabilitySupportsTtsKind(
|
|
|
83
85
|
return hasPocket;
|
|
84
86
|
case TtsModelKind::kZipvoice:
|
|
85
87
|
return hasZipvoice;
|
|
88
|
+
case TtsModelKind::kSupertonic:
|
|
89
|
+
return hasSupertonic;
|
|
86
90
|
default:
|
|
87
91
|
return false;
|
|
88
92
|
}
|
|
@@ -108,6 +112,7 @@ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& model
|
|
|
108
112
|
if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
|
|
109
113
|
if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
|
|
110
114
|
if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
|
|
115
|
+
if (lower.find("supertonic") != std::string::npos) add(TtsModelKind::kSupertonic);
|
|
111
116
|
if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
|
|
112
117
|
if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
|
|
113
118
|
if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
|
|
@@ -143,16 +148,27 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
143
148
|
std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
|
|
144
149
|
std::string vocabJsonFile = FindFileByName(files, "vocab.json");
|
|
145
150
|
std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
|
|
151
|
+
std::string durationPredictor = FindOnnxByAnyToken(files, {"duration_predictor", "duration-predictor"}, std::nullopt);
|
|
152
|
+
std::string textEncoderSupertonic = FindOnnxByAnyToken(files, {"text_encoder", "text-encoder"}, std::nullopt);
|
|
153
|
+
std::string vectorEstimator = FindOnnxByAnyToken(files, {"vector_estimator", "vector-estimator"}, std::nullopt);
|
|
154
|
+
std::string ttsJsonFile = FindFileByName(files, "tts.json");
|
|
155
|
+
std::string unicodeIndexerFile = FindFileByName(files, "unicode_indexer.bin");
|
|
156
|
+
std::string voiceStyleFile = FindFileByName(files, "voice.bin");
|
|
146
157
|
|
|
147
158
|
std::vector<std::string> modelExcludes = {
|
|
148
|
-
"acoustic", "vocoder", "encoder", "decoder", "joiner"
|
|
159
|
+
"acoustic", "vocoder", "encoder", "decoder", "joiner",
|
|
160
|
+
// Supertonic component models are not VITS monolithic model.onnx files.
|
|
161
|
+
"duration_predictor", "duration-predictor",
|
|
162
|
+
"text_encoder", "text-encoder",
|
|
163
|
+
"vector_estimator", "vector-estimator"
|
|
149
164
|
};
|
|
150
165
|
std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
|
|
151
166
|
if (ttsModel.empty()) {
|
|
152
167
|
ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
|
|
153
168
|
}
|
|
154
169
|
|
|
155
|
-
|
|
170
|
+
// VITS requires both model.onnx-like file and tokens.txt
|
|
171
|
+
bool hasVits = !ttsModel.empty() && !tokensFile.empty();
|
|
156
172
|
std::string modelDirLower = ToLower(modelDir);
|
|
157
173
|
bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
|
|
158
174
|
bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
|
|
@@ -169,6 +185,9 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
169
185
|
}
|
|
170
186
|
bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
|
|
171
187
|
!textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
|
|
188
|
+
bool hasSupertonic = !durationPredictor.empty() && !textEncoderSupertonic.empty() &&
|
|
189
|
+
!vectorEstimator.empty() && !vocoder.empty() && !ttsJsonFile.empty() &&
|
|
190
|
+
!unicodeIndexerFile.empty() && !voiceStyleFile.empty();
|
|
172
191
|
bool hasDataDir = !dataDirPath.empty();
|
|
173
192
|
|
|
174
193
|
bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
|
|
@@ -177,6 +196,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
177
196
|
if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
|
|
178
197
|
if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
|
|
179
198
|
if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
|
|
199
|
+
if (hasSupertonic) result.detectedModels.push_back({"supertonic", modelDir});
|
|
180
200
|
if (hasVoicesFile) {
|
|
181
201
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
182
202
|
result.detectedModels.push_back({"kitten", modelDir});
|
|
@@ -205,7 +225,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
205
225
|
std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
|
|
206
226
|
if (!nameCandidates.empty()) {
|
|
207
227
|
for (TtsModelKind k : nameCandidates) {
|
|
208
|
-
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
|
|
228
|
+
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice, hasSupertonic,
|
|
209
229
|
hasVoicesFile, hasDataDir)) {
|
|
210
230
|
selected = k;
|
|
211
231
|
break;
|
|
@@ -216,6 +236,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
216
236
|
if (hasMatcha) selected = TtsModelKind::kMatcha;
|
|
217
237
|
else if (hasPocket) selected = TtsModelKind::kPocket;
|
|
218
238
|
else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
|
|
239
|
+
else if (hasSupertonic) selected = TtsModelKind::kSupertonic;
|
|
219
240
|
else if (hasVoicesFile) {
|
|
220
241
|
if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
|
|
221
242
|
else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
|
|
@@ -256,6 +277,12 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
256
277
|
result.paths.textConditioner = textConditioner;
|
|
257
278
|
result.paths.vocabJson = vocabJsonFile;
|
|
258
279
|
result.paths.tokenScoresJson = tokenScoresJsonFile;
|
|
280
|
+
result.paths.durationPredictor = durationPredictor;
|
|
281
|
+
result.paths.textEncoder = textEncoderSupertonic;
|
|
282
|
+
result.paths.vectorEstimator = vectorEstimator;
|
|
283
|
+
result.paths.ttsJson = ttsJsonFile;
|
|
284
|
+
result.paths.unicodeIndexer = unicodeIndexerFile;
|
|
285
|
+
result.paths.voiceStyle = voiceStyleFile;
|
|
259
286
|
|
|
260
287
|
auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
|
|
261
288
|
if (!validation.ok) {
|
|
@@ -20,6 +20,7 @@ enum class SttModelKind {
|
|
|
20
20
|
kZipformerCtc,
|
|
21
21
|
kWhisper,
|
|
22
22
|
kFunAsrNano,
|
|
23
|
+
kQwen3Asr,
|
|
23
24
|
kFireRedAsr,
|
|
24
25
|
kMoonshine,
|
|
25
26
|
kMoonshineV2,
|
|
@@ -38,7 +39,8 @@ enum class TtsModelKind {
|
|
|
38
39
|
kKokoro,
|
|
39
40
|
kKitten,
|
|
40
41
|
kPocket,
|
|
41
|
-
kZipvoice
|
|
42
|
+
kZipvoice,
|
|
43
|
+
kSupertonic
|
|
42
44
|
};
|
|
43
45
|
|
|
44
46
|
struct SttModelPaths {
|
|
@@ -56,6 +58,11 @@ struct SttModelPaths {
|
|
|
56
58
|
std::string funasrLLM;
|
|
57
59
|
std::string funasrEmbedding;
|
|
58
60
|
std::string funasrTokenizer;
|
|
61
|
+
/** Qwen3-ASR: conv_frontend + encoder + decoder + tokenizer directory. */
|
|
62
|
+
std::string qwen3ConvFrontend;
|
|
63
|
+
std::string qwen3Encoder;
|
|
64
|
+
std::string qwen3Decoder;
|
|
65
|
+
std::string qwen3Tokenizer;
|
|
59
66
|
// Moonshine
|
|
60
67
|
std::string moonshinePreprocessor;
|
|
61
68
|
std::string moonshineEncoder;
|
|
@@ -88,6 +95,8 @@ struct SttCandidatePaths {
|
|
|
88
95
|
std::string funasrLLM;
|
|
89
96
|
std::string funasrEmbedding;
|
|
90
97
|
std::string funasrTokenizerDir;
|
|
98
|
+
std::string qwen3ConvFrontend;
|
|
99
|
+
std::string qwen3TokenizerDir;
|
|
91
100
|
std::string moonshinePreprocessor;
|
|
92
101
|
std::string moonshineEncoder;
|
|
93
102
|
std::string moonshineUncachedDecoder;
|
|
@@ -103,6 +112,7 @@ struct SttPathHints {
|
|
|
103
112
|
bool isLikelyWenetCtc = false;
|
|
104
113
|
bool isLikelySenseVoice = false;
|
|
105
114
|
bool isLikelyFunAsrNano = false;
|
|
115
|
+
bool isLikelyQwen3Asr = false;
|
|
106
116
|
bool isLikelyZipformer = false;
|
|
107
117
|
bool isLikelyMoonshine = false;
|
|
108
118
|
bool isLikelyDolphin = false;
|
|
@@ -127,6 +137,7 @@ struct SttCapabilities {
|
|
|
127
137
|
bool hasMoonshineV2 = false;
|
|
128
138
|
bool hasParaformer = false;
|
|
129
139
|
bool hasFunAsrNano = false;
|
|
140
|
+
bool hasQwen3Asr = false;
|
|
130
141
|
bool hasDolphin = false;
|
|
131
142
|
bool hasFireRedAsr = false;
|
|
132
143
|
/** True when dir name suggests Fire Red but only a single CTC/paraformer model (no encoder/decoder). Use zipformer_ctc. */
|
|
@@ -154,6 +165,13 @@ struct TtsModelPaths {
|
|
|
154
165
|
std::string textConditioner;
|
|
155
166
|
std::string vocabJson;
|
|
156
167
|
std::string tokenScoresJson;
|
|
168
|
+
// Supertonic TTS
|
|
169
|
+
std::string durationPredictor;
|
|
170
|
+
std::string textEncoder;
|
|
171
|
+
std::string vectorEstimator;
|
|
172
|
+
std::string ttsJson;
|
|
173
|
+
std::string unicodeIndexer;
|
|
174
|
+
std::string voiceStyle;
|
|
157
175
|
};
|
|
158
176
|
|
|
159
177
|
struct SttDetectResult {
|
|
@@ -23,6 +23,7 @@ const char* SttModelKindToString(SttModelKind k) {
|
|
|
23
23
|
case SttModelKind::kZipformerCtc: return "zipformer_ctc";
|
|
24
24
|
case SttModelKind::kWhisper: return "whisper";
|
|
25
25
|
case SttModelKind::kFunAsrNano: return "funasr_nano";
|
|
26
|
+
case SttModelKind::kQwen3Asr: return "qwen3_asr";
|
|
26
27
|
case SttModelKind::kFireRedAsr: return "fire_red_asr";
|
|
27
28
|
case SttModelKind::kMoonshine: return "moonshine";
|
|
28
29
|
case SttModelKind::kMoonshineV2: return "moonshine_v2";
|
|
@@ -79,6 +80,10 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
|
|
|
79
80
|
PutString(env, pathsMap, mapPut, "funasrLLM", result.paths.funasrLLM);
|
|
80
81
|
PutString(env, pathsMap, mapPut, "funasrEmbedding", result.paths.funasrEmbedding);
|
|
81
82
|
PutString(env, pathsMap, mapPut, "funasrTokenizer", result.paths.funasrTokenizer);
|
|
83
|
+
PutString(env, pathsMap, mapPut, "qwen3ConvFrontend", result.paths.qwen3ConvFrontend);
|
|
84
|
+
PutString(env, pathsMap, mapPut, "qwen3Encoder", result.paths.qwen3Encoder);
|
|
85
|
+
PutString(env, pathsMap, mapPut, "qwen3Decoder", result.paths.qwen3Decoder);
|
|
86
|
+
PutString(env, pathsMap, mapPut, "qwen3Tokenizer", result.paths.qwen3Tokenizer);
|
|
82
87
|
PutString(env, pathsMap, mapPut, "moonshinePreprocessor", result.paths.moonshinePreprocessor);
|
|
83
88
|
PutString(env, pathsMap, mapPut, "moonshineEncoder", result.paths.moonshineEncoder);
|
|
84
89
|
PutString(env, pathsMap, mapPut, "moonshineUncachedDecoder", result.paths.moonshineUncachedDecoder);
|