react-native-sherpa-onnx 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +7 -2
  2. package/SherpaOnnx.podspec +4 -1
  3. package/android/prebuilt-download.gradle +23 -23
  4. package/android/src/main/assets/model_licenses/asr-models-license-status.csv +1 -0
  5. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +23 -0
  6. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +9 -0
  7. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +51 -8
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +31 -4
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +19 -1
  10. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +5 -0
  11. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +7 -0
  12. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +11 -0
  13. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +14 -0
  14. package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +110 -35
  15. package/android/src/main/java/com/sherpaonnx/SherpaOnnxExtractionNotificationHelper.kt +102 -0
  16. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +92 -18
  17. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +22 -0
  18. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +15 -0
  19. package/ios/Resources/model_licenses/asr-models-license-status.csv +1 -0
  20. package/ios/SherpaOnnx+STT.mm +13 -1
  21. package/ios/SherpaOnnx+TTS.mm +1 -0
  22. package/ios/SherpaOnnx.mm +87 -17
  23. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +5 -0
  24. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +23 -0
  25. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +51 -7
  26. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +36 -4
  27. package/ios/model_detect/sherpa-onnx-model-detect.h +19 -1
  28. package/ios/model_detect/sherpa-onnx-validate-stt.mm +11 -0
  29. package/ios/model_detect/sherpa-onnx-validate-tts.mm +14 -0
  30. package/ios/stt/sherpa-onnx-stt-wrapper.h +11 -1
  31. package/ios/stt/sherpa-onnx-stt-wrapper.mm +30 -2
  32. package/ios/tts/sherpa-onnx-tts-wrapper.mm +25 -0
  33. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  34. package/lib/module/download/ModelDownloadManager.js +1 -1
  35. package/lib/module/download/ModelDownloadManager.js.map +1 -1
  36. package/lib/module/download/background-downloader-types.js +2 -0
  37. package/lib/module/download/background-downloader-types.js.map +1 -0
  38. package/lib/module/download/downloadTask.js +54 -1
  39. package/lib/module/download/downloadTask.js.map +1 -1
  40. package/lib/module/download/index.js +1 -1
  41. package/lib/module/download/index.js.map +1 -1
  42. package/lib/module/download/postDownloadProcessing.js +17 -4
  43. package/lib/module/download/postDownloadProcessing.js.map +1 -1
  44. package/lib/module/download/registry.js +1 -0
  45. package/lib/module/download/registry.js.map +1 -1
  46. package/lib/module/extraction/extractTarBz2.js +2 -2
  47. package/lib/module/extraction/extractTarBz2.js.map +1 -1
  48. package/lib/module/extraction/extractTarZst.js +2 -2
  49. package/lib/module/extraction/extractTarZst.js.map +1 -1
  50. package/lib/module/extraction/index.js +10 -5
  51. package/lib/module/extraction/index.js.map +1 -1
  52. package/lib/module/stt/index.js +4 -2
  53. package/lib/module/stt/index.js.map +1 -1
  54. package/lib/module/stt/streaming.js +2 -1
  55. package/lib/module/stt/streaming.js.map +1 -1
  56. package/lib/module/stt/types.js +3 -1
  57. package/lib/module/stt/types.js.map +1 -1
  58. package/lib/module/tts/index.js +5 -3
  59. package/lib/module/tts/index.js.map +1 -1
  60. package/lib/module/tts/streaming.js +4 -2
  61. package/lib/module/tts/streaming.js.map +1 -1
  62. package/lib/module/tts/types.js +4 -1
  63. package/lib/module/tts/types.js.map +1 -1
  64. package/lib/typescript/src/NativeSherpaOnnx.d.ts +26 -10
  65. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  66. package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
  67. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
  68. package/lib/typescript/src/download/background-downloader-types.d.ts +64 -0
  69. package/lib/typescript/src/download/background-downloader-types.d.ts.map +1 -0
  70. package/lib/typescript/src/download/downloadTask.d.ts +10 -0
  71. package/lib/typescript/src/download/downloadTask.d.ts.map +1 -1
  72. package/lib/typescript/src/download/index.d.ts +2 -2
  73. package/lib/typescript/src/download/index.d.ts.map +1 -1
  74. package/lib/typescript/src/download/postDownloadProcessing.d.ts +9 -0
  75. package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -1
  76. package/lib/typescript/src/download/registry.d.ts.map +1 -1
  77. package/lib/typescript/src/extraction/extractTarBz2.d.ts +2 -1
  78. package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -1
  79. package/lib/typescript/src/extraction/extractTarZst.d.ts +2 -1
  80. package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -1
  81. package/lib/typescript/src/extraction/index.d.ts +1 -1
  82. package/lib/typescript/src/extraction/index.d.ts.map +1 -1
  83. package/lib/typescript/src/extraction/types.d.ts +12 -0
  84. package/lib/typescript/src/extraction/types.d.ts.map +1 -1
  85. package/lib/typescript/src/stt/index.d.ts +1 -1
  86. package/lib/typescript/src/stt/index.d.ts.map +1 -1
  87. package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
  88. package/lib/typescript/src/stt/types.d.ts +16 -1
  89. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  90. package/lib/typescript/src/tts/index.d.ts +1 -1
  91. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  92. package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
  93. package/lib/typescript/src/tts/types.d.ts +6 -1
  94. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  95. package/package.json +1 -1
  96. package/scripts/ci/update_model_license_csv.sh +16 -16
  97. package/src/NativeSherpaOnnx.ts +38 -11
  98. package/src/download/ModelDownloadManager.ts +2 -0
  99. package/src/download/background-downloader-types.ts +73 -0
  100. package/src/download/downloadTask.ts +68 -0
  101. package/src/download/index.ts +2 -0
  102. package/src/download/postDownloadProcessing.ts +24 -1
  103. package/src/download/registry.ts +1 -0
  104. package/src/extraction/extractTarBz2.ts +7 -2
  105. package/src/extraction/extractTarZst.ts +7 -2
  106. package/src/extraction/index.ts +29 -6
  107. package/src/extraction/types.ts +16 -0
  108. package/src/stt/index.ts +8 -7
  109. package/src/stt/streaming.ts +7 -1
  110. package/src/stt/types.ts +18 -0
  111. package/src/tts/index.ts +10 -7
  112. package/src/tts/streaming.ts +8 -3
  113. package/src/tts/types.ts +9 -0
  114. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
  115. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
  116. package/lib/module/download/background-downloader.d.js +0 -2
  117. package/lib/module/download/background-downloader.d.js.map +0 -1
  118. package/src/download/background-downloader.d.ts +0 -43
package/README.md CHANGED
@@ -78,6 +78,8 @@ If you use the [download manager](docs/download-manager.md) to fetch models at r
78
78
 
79
79
  Full step-by-step: [Download manager – Setup (iOS & Android)](docs/download-manager.md#setup-ios--android). Expo users can use the library’s config plugin to apply this automatically.
80
80
 
81
+ **Android:** Foreground service permissions (Play Console), visible download notifications, and **`POST_NOTIFICATIONS` (API 33+)** are covered in [Download manager – Android: foreground service & notifications](docs/download-manager.md#android-foreground-service--notifications).
82
+
81
83
  ## Table of contents
82
84
 
83
85
  - [Bundled sherpa-onnx version](#bundled-sherpa-onnx-version)
@@ -124,8 +126,8 @@ Full step-by-step: [Download manager – Setup (iOS & Android)](docs/download-ma
124
126
  | Model quantization | ✅ **Supported** | [Model setup](./docs/model-setup.md) | Automatic detection and preference for quantized (int8) models. |
125
127
  | Flexible model loading | ✅ **Supported** | [Model setup](./docs/model-setup.md) | Asset models, file system models, or auto-detection. |
126
128
  | TypeScript | ✅ **Supported** | — | Full type definitions included. |
127
- | Speaker Diarization | ❌ Not yet supported | [Diarization](./docs/diarization.md) | Scheduled for release 0.4.0 |
128
- | Speech Enhancement | ❌ Not yet supported | [Enhancement](./docs/enhancement.md) | Scheduled for release 0.5.0 |
129
+ | Speech Enhancement | ❌ Not yet supported | [Enhancement](./docs/enhancement.md) | Scheduled for release 0.4.0 |
130
+ | Speaker Diarization | ❌ Not yet supported | [Diarization](./docs/diarization.md) | Scheduled for release 0.5.0 |
129
131
  | Source Separation | ❌ Not yet supported | [Separation](./docs/separation.md) | Scheduled for release 0.6.0 |
130
132
  | VAD (Voice Activity Detection) | ❌ Not yet supported | [VAD](./docs/vad.md) | Scheduled for release 0.7.0 |
131
133
 
@@ -146,6 +148,7 @@ Full step-by-step: [Download manager – Setup (iOS & Android)](docs/download-ma
146
148
 
147
149
  | Model Type | `modelType` Value | Description | Download Links |
148
150
  | ------------------------ | ----------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ |
151
+ | **Auto Detect** | `'auto'` | Automatically detects model layout/type from files in the model folder and picks the best supported STT type. | n/a |
149
152
  | **Zipformer/Transducer** | `'transducer'` | Encoder–decoder–joiner (e.g. icefall). Good balance of speed and accuracy. Folder name should contain **zipformer** or **transducer** for auto-detection. | [Download](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html) |
150
153
  | **LSTM Transducer** | `'transducer'` | Same layout as Zipformer (encoder–decoder–joiner). LSTM-based streaming ASR; detected as transducer. Folder name may contain **lstm**. | [Download](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/lstm-transducer-models.html) |
151
154
  | **Paraformer** | `'paraformer'` | Single-model non-autoregressive ASR; fast and accurate. Detected by `model.onnx`; no folder token required. | [Download](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html) |
@@ -170,12 +173,14 @@ For **real-time (streaming) recognition** from a microphone or audio stream, use
170
173
 
171
174
  | Model Type | `modelType` Value | Description | Download Links |
172
175
  | ---------------- | ----------------- | ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- |
176
+ | **Auto Detect** | `'auto'` | Automatically detects the TTS model layout from files in the model folder and selects the matching supported type. | n/a |
173
177
  | **VITS** | `'vits'` | Fast, high-quality TTS (Piper, Coqui, MeloTTS, MMS). Folder name should contain **vits** if used with other voice models. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
174
178
  | **Matcha** | `'matcha'` | High-quality acoustic model + vocoder. Detected by acoustic_model + vocoder; no folder token required. | [Download](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html) |
175
179
  | **Kokoro** | `'kokoro'` | Multi-speaker, multi-language. Folder name should contain **kokoro** (not kitten) for auto-detection. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
176
180
  | **KittenTTS** | `'kitten'` | Lightweight, multi-speaker. Folder name should contain **kitten** (not kokoro) for auto-detection. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
177
181
  | **Zipvoice** | `'zipvoice'` | Standard TTS with **`sid`**. **Voice cloning** (reference audio + `referenceText`): batch via **`generateSpeech`** only—streaming TTS does not support reference audio for Zipvoice. Default **`numSteps`** when omitted is **5** on **Android and iOS** (matches sherpa-onnx `GenerationConfig` / Kotlin helper). Cloning is **supported on Android & iOS**. Encoder + decoder + vocoder. | [Download](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/zipvoice.html) |
178
182
  | **Pocket** | `'pocket'` | Flow-matching TTS. **Voice cloning** on **Android:** batch and streaming TTS. **iOS:** cloning is experimental. Detected by lm_flow, lm_main, text_conditioner, vocab/token_scores. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
183
+ | **Supertonic** | `'supertonic'` | Lightning-fast, on-device text-to-speech system designed for extreme performance with minimal computational overhead. | [Download](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |
179
184
 
180
185
  For **streaming TTS** (incremental generation, low latency), use `createStreamingTTS()` with supported model types. See [Streaming Text-to-Speech](./docs/tts-streaming.md).
181
186
 
package/SherpaOnnx.podspec CHANGED
@@ -140,7 +140,10 @@ Pod::Spec.new do |s|
140
140
  s.libraries = "c++", "z", "iconv", "bz2"
141
141
 
142
142
  # Per-release-model license metadata (synced from CI; same CSV as android/src/main/assets/model_licenses/).
143
- s.resources = ["ios/Resources/model_licenses/*.csv"]
143
+ # Use resource_bundles so assets are packaged reliably across CocoaPods integration modes.
144
+ s.resource_bundles = {
145
+ "SherpaOnnxResources" => ["ios/Resources/model_licenses/*.csv"]
146
+ }
144
147
 
145
148
  install_modules_dependencies(s)
146
149
  end
package/android/prebuilt-download.gradle CHANGED
@@ -222,14 +222,14 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
222
222
  sherpaVersionFile.text = currentSherpaVersion
223
223
  sherpaResolved = true
224
224
  println "[sherpa-onnx] jniLibs (*.so per ABI) .............. MAVEN_AAR ${aar.name}"
225
- println "[sherpa-onnx] install: jni/<abi>/*.so ${jniLibsHuman}/<abi>/"
226
- println "[sherpa-onnx] C headers (sherpa-onnx) ............. ${copiedHeaders ? 'MAVEN_AAR c-api/** ' + includeSherpaDir : 'unchanged (no c-api/ in AAR; existing tree kept)'}"
225
+ println "[sherpa-onnx] install: jni/<abi>/*.so --> ${jniLibsHuman}/<abi>/"
226
+ println "[sherpa-onnx] C headers (sherpa-onnx) ............. ${copiedHeaders ? 'MAVEN_AAR c-api/** --> ' + includeSherpaDir : 'unchanged (no c-api/ in AAR; existing tree kept)'}"
227
227
  println "[sherpa-onnx] version stamp ...................... written ${sherpaVersionFile.name}=${currentSherpaVersion}"
228
228
  } else {
229
- println "[sherpa-onnx] MAVEN_AAR: sherpaOnnxAar empty trying GITHUB_RELEASE"
229
+ println "[sherpa-onnx] MAVEN_AAR: sherpaOnnxAar empty --> trying GITHUB_RELEASE"
230
230
  }
231
231
  } catch (Exception e) {
232
- println "[sherpa-onnx] MAVEN_AAR failed: ${e.message} trying GITHUB_RELEASE"
232
+ println "[sherpa-onnx] MAVEN_AAR failed: ${e.message} --> trying GITHUB_RELEASE"
233
233
  }
234
234
  }
235
235
  } else {
@@ -290,14 +290,14 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
290
290
  ffmpegVersionFile.text = currentFfmpegVersion
291
291
  ffmpegResolved = true
292
292
  println "[FFmpeg] jniLibs .................................. MAVEN_AAR ${aar.name}"
293
- println "[FFmpeg] install: jni/<abi>/*.so ${jniLibsHuman}/<abi>/"
294
- println "[FFmpeg] C headers ................................ ${copiedHdr ? 'MAVEN_AAR include/** ' + ffmpegIncludeDir : 'unchanged (no include/ in AAR)'}"
293
+ println "[FFmpeg] install: jni/<abi>/*.so --> ${jniLibsHuman}/<abi>/"
294
+ println "[FFmpeg] C headers ................................ ${copiedHdr ? 'MAVEN_AAR include/** --> ' + ffmpegIncludeDir : 'unchanged (no include/ in AAR)'}"
295
295
  println "[FFmpeg] version stamp ............................ written ${ffmpegVersionFile.name}=${currentFfmpegVersion}"
296
296
  } else {
297
- println "[FFmpeg] MAVEN_AAR: ffmpegAar empty trying GITHUB_RELEASE"
297
+ println "[FFmpeg] MAVEN_AAR: ffmpegAar empty --> trying GITHUB_RELEASE"
298
298
  }
299
299
  } catch (Exception e) {
300
- println "[FFmpeg] MAVEN_AAR failed: ${e.message} trying GITHUB_RELEASE"
300
+ println "[FFmpeg] MAVEN_AAR failed: ${e.message} --> trying GITHUB_RELEASE"
301
301
  }
302
302
  }
303
303
  } else {
@@ -358,14 +358,14 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
358
358
  libarchiveVersionFile.text = currentLibarchiveVersion
359
359
  libarchiveResolved = true
360
360
  println "[libarchive] jniLibs .............................. MAVEN_AAR ${aar.name}"
361
- println "[libarchive] install: jni/<abi>/*.so ${jniLibsHuman}/<abi>/"
362
- println "[libarchive] C headers ............................ ${copiedHdr ? 'MAVEN_AAR include/** ' + libarchiveIncludeDir : 'unchanged (no include/ in AAR)'}"
361
+ println "[libarchive] install: jni/<abi>/*.so --> ${jniLibsHuman}/<abi>/"
362
+ println "[libarchive] C headers ............................ ${copiedHdr ? 'MAVEN_AAR include/** --> ' + libarchiveIncludeDir : 'unchanged (no include/ in AAR)'}"
363
363
  println "[libarchive] version stamp ........................ written ${libarchiveVersionFile.name}=${currentLibarchiveVersion}"
364
364
  } else {
365
- println "[libarchive] MAVEN_AAR: libarchiveAar empty trying GITHUB_RELEASE"
365
+ println "[libarchive] MAVEN_AAR: libarchiveAar empty --> trying GITHUB_RELEASE"
366
366
  }
367
367
  } catch (Exception e) {
368
- println "[libarchive] MAVEN_AAR failed: ${e.message} trying GITHUB_RELEASE"
368
+ println "[libarchive] MAVEN_AAR failed: ${e.message} --> trying GITHUB_RELEASE"
369
369
  }
370
370
  }
371
371
  } else {
@@ -415,7 +415,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
415
415
  }
416
416
  ortJniResolved = true
417
417
  println "[onnxruntime] libonnxruntime4j_jni.so .......... MAVEN_AAR ${aar.name}"
418
- println "[onnxruntime] install: per ABI ${jniLibsHuman}/<abi>/ (only JNI bridge; libonnxruntime.so from sherpa prebuilts)"
418
+ println "[onnxruntime] install: per ABI --> ${jniLibsHuman}/<abi>/ (only JNI bridge; libonnxruntime.so from sherpa prebuilts)"
419
419
  } else {
420
420
  println "[onnxruntime] MAVEN_AAR: onnxruntimeAar empty — libonnxruntime4j_jni.so still missing"
421
421
  }
@@ -436,7 +436,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
436
436
  def needLibarchive = !sherpaOnnxDisableLibarchive && !libarchiveResolved
437
437
  def needSherpa = !sherpaResolved
438
438
  if (needFfmpeg || needLibarchive || needSherpa) {
439
- println "[prebuilt] GITHUB_RELEASE: skipped (no repo). Set -PprebuiltGitHubRepo=owner/repo or git remote origin github.com"
439
+ println "[prebuilt] GITHUB_RELEASE: skipped (no repo). Set -PprebuiltGitHubRepo=owner/repo or git remote origin --> github.com"
440
440
  println "[prebuilt] still need: sherpa=${needSherpa}, ffmpeg=${needFfmpeg}, libarchive=${needLibarchive}"
441
441
  def diag = [
442
442
  "prebuiltGitHubRepo=${project.findProperty('prebuiltGitHubRepo') ?: '(not set)'}",
@@ -492,7 +492,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
492
492
  ffmpegVersionFile.text = currentFfmpegVersion
493
493
  println "[FFmpeg] jniLibs + C headers ...................... GITHUB_RELEASE tag=${tag}"
494
494
  println "[FFmpeg] url: ${url}"
495
- println "[FFmpeg] install: <abi>/*.so ${jniLibsHuman}/"
495
+ println "[FFmpeg] install: <abi>/*.so --> ${jniLibsHuman}/"
496
496
  println "[FFmpeg] version stamp .......................... written ${ffmpegVersionFile.name}=${currentFfmpegVersion}"
497
497
  }
498
498
 
@@ -522,7 +522,7 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
522
522
  libarchiveVersionFile.text = currentLibarchiveVersion
523
523
  println "[libarchive] jniLibs + C headers .................. GITHUB_RELEASE tag=${tag}"
524
524
  println "[libarchive] url: ${url}"
525
- println "[libarchive] install: <abi>/*.so ${jniLibsHuman}/"
525
+ println "[libarchive] install: <abi>/*.so --> ${jniLibsHuman}/"
526
526
  println "[libarchive] version stamp ........................ written ${libarchiveVersionFile.name}=${currentLibarchiveVersion}"
527
527
  }
528
528
 
@@ -560,8 +560,8 @@ project.tasks.register("downloadNativeLibsIfNeeded") {
560
560
  sherpaVersionFile.text = currentSherpaVersion
561
561
  println "[sherpa-onnx] jniLibs + C headers ................. GITHUB_RELEASE tag=${tag}"
562
562
  println "[sherpa-onnx] url: ${url}"
563
- println "[sherpa-onnx] install: <abi>/*.so ${jniLibsHuman}/"
564
- println "[sherpa-onnx] classes.jar ......................... ${sherpaJavaJar.exists() ? 'GITHUB_RELEASE ' + sherpaOnnxClassesDir : 'not in zip (use extractSherpaOnnxClasses)'}"
563
+ println "[sherpa-onnx] install: <abi>/*.so --> ${jniLibsHuman}/"
564
+ println "[sherpa-onnx] classes.jar ......................... ${sherpaJavaJar.exists() ? 'GITHUB_RELEASE --> ' + sherpaOnnxClassesDir : 'not in zip (use extractSherpaOnnxClasses)'}"
565
565
  println "[sherpa-onnx] version stamp ...................... written ${sherpaVersionFile.name}=${currentSherpaVersion}"
566
566
  }
567
567
  println ""
@@ -612,7 +612,7 @@ project.afterEvaluate {
612
612
  project.tasks.findByName('preBuild')?.dependsOn(project.tasks.findByName('checkJniLibs'))
613
613
  }
614
614
 
615
- // sherpa-onnx classes.jar: resolution order THIRD_PARTY MAVEN_AAR GITHUB_EXTRACT
615
+ // sherpa-onnx classes.jar: resolution order THIRD_PARTY --> MAVEN_AAR --> GITHUB_EXTRACT
616
616
  def sherpaLocalJar = file("${project.projectDir.parent}/third_party/sherpa-onnx-prebuilt/android/java/classes.jar")
617
617
  def sherpaExtractedJar = file("${project.buildDir}/prebuilt-downloads/sherpa-onnx-extract/java/classes.jar")
618
618
 
@@ -627,7 +627,7 @@ project.tasks.register("extractSherpaOnnxClasses") {
627
627
  copy { from sherpaLocalJar; into sherpaOnnxClassesDir }
628
628
  println "[prebuilt] extractSherpaOnnxClasses"
629
629
  println "[sherpa-onnx] classes.jar (Kotlin API) .......... THIRD_PARTY"
630
- println "[sherpa-onnx] ${sherpaLocalJar.absolutePath} ${sherpaOnnxClassesDir}"
630
+ println "[sherpa-onnx] ${sherpaLocalJar.absolutePath} --> ${sherpaOnnxClassesDir}"
631
631
  return
632
632
  }
633
633
  def aarFiles = project.configurations.sherpaOnnxAar.files
@@ -640,14 +640,14 @@ project.tasks.register("extractSherpaOnnxClasses") {
640
640
  }
641
641
  println "[prebuilt] extractSherpaOnnxClasses"
642
642
  println "[sherpa-onnx] classes.jar (Kotlin API) .......... MAVEN_AAR ${aar.name}"
643
- println "[sherpa-onnx] classes.jar ${sherpaOnnxClassesDir}"
643
+ println "[sherpa-onnx] classes.jar --> ${sherpaOnnxClassesDir}"
644
644
  return
645
645
  }
646
646
  if (sherpaExtractedJar.exists()) {
647
647
  copy { from sherpaExtractedJar; into sherpaOnnxClassesDir }
648
648
  println "[prebuilt] extractSherpaOnnxClasses"
649
649
  println "[sherpa-onnx] classes.jar (Kotlin API) .......... GITHUB_EXTRACT"
650
- println "[sherpa-onnx] ${sherpaExtractedJar.absolutePath} ${sherpaOnnxClassesDir}"
650
+ println "[sherpa-onnx] ${sherpaExtractedJar.absolutePath} --> ${sherpaOnnxClassesDir}"
651
651
  return
652
652
  }
653
653
  throw new RuntimeException(
@@ -678,7 +678,7 @@ project.tasks.register("extractOnnxruntimeClasses") {
678
678
  }
679
679
  println "[prebuilt] extractOnnxruntimeClasses"
680
680
  println "[onnxruntime] classes.jar (Java API) ........... MAVEN_AAR ${aar.name}"
681
- println "[onnxruntime] renamed to onnxruntime-classes.jar ${onnxruntimeClassesDir}"
681
+ println "[onnxruntime] renamed to onnxruntime-classes.jar --> ${onnxruntimeClassesDir}"
682
682
  return
683
683
  }
684
684
  throw new RuntimeException(
package/android/src/main/assets/model_licenses/asr-models-license-status.csv CHANGED
@@ -397,6 +397,7 @@ sherpa-onnx-rk3576-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high
397
397
  sherpa-onnx-rk3568-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
398
398
  sherpa-onnx-rk3566-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
399
399
  sherpa-onnx-rk3562-streaming-zipformer-en-2023-06-26.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
400
+ sherpa-onnx-qwen3-asr-0.6B-int8-2026-03-25.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/Qwen/Qwen3-ASR-0.6B
400
401
  sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
401
402
  sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
402
403
  sherpa-onnx-rk3576-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp CHANGED
@@ -389,5 +389,28 @@ std::vector<LexiconCandidate> FindLexiconCandidates(
389
389
  return candidates;
390
390
  }
391
391
 
392
+ bool Qwen3TokenizerDirHasVocabAndMerges(
393
+ const std::vector<FileEntry>& files,
394
+ const std::string& dirRaw
395
+ ) {
396
+ std::string dir = dirRaw;
397
+ while (!dir.empty() && (dir.back() == '/' || dir.back() == '\\'))
398
+ dir.pop_back();
399
+ if (dir.empty()) return false;
400
+ bool hasVocab = false;
401
+ bool hasMerges = false;
402
+ const std::string prefix = dir + "/";
403
+ for (const auto& e : files) {
404
+ if (e.path.size() <= prefix.size()) continue;
405
+ if (e.path.compare(0, prefix.size(), prefix) != 0) continue;
406
+ std::string rest = e.path.substr(prefix.size());
407
+ if (rest.find('/') != std::string::npos || rest.find('\\') != std::string::npos) continue;
408
+ if (e.nameLower == "vocab.json") hasVocab = true;
409
+ if (e.nameLower == "merges.txt") hasMerges = true;
410
+ }
411
+ if (hasVocab && hasMerges) return true;
412
+ return FileExists(dir + "/vocab.json") && FileExists(dir + "/merges.txt");
413
+ }
414
+
392
415
  } // namespace model_detect
393
416
  } // namespace sherpaonnx
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h CHANGED
@@ -88,6 +88,15 @@ std::vector<LexiconCandidate> FindLexiconCandidates(
88
88
  const std::string& rootDir
89
89
  );
90
90
 
91
+ /**
92
+ * True if `dir` contains vocab.json and merges.txt: listed in `files` (fixture / synthetic trees)
93
+ * or present on disk. Used for Qwen3-ASR tokenizer directory detection.
94
+ */
95
+ bool Qwen3TokenizerDirHasVocabAndMerges(
96
+ const std::vector<FileEntry>& files,
97
+ const std::string& dir
98
+ );
99
+
91
100
  } // namespace model_detect
92
101
  } // namespace sherpaonnx
93
102
 
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp CHANGED
@@ -61,6 +61,7 @@ static const char* KindToName(SttModelKind k) {
61
61
  case SttModelKind::kZipformerCtc: return "zipformer_ctc";
62
62
  case SttModelKind::kWhisper: return "whisper";
63
63
  case SttModelKind::kFunAsrNano: return "funasr_nano";
64
+ case SttModelKind::kQwen3Asr: return "qwen3_asr";
64
65
  case SttModelKind::kFireRedAsr: return "fire_red_asr";
65
66
  case SttModelKind::kMoonshine: return "moonshine";
66
67
  case SttModelKind::kMoonshineV2: return "moonshine_v2";
@@ -88,6 +89,7 @@ SttModelKind ParseSttModelType(const std::string& modelType) {
88
89
  if (modelType == "zipformer_ctc" || modelType == "ctc") return SttModelKind::kZipformerCtc;
89
90
  if (modelType == "whisper") return SttModelKind::kWhisper;
90
91
  if (modelType == "funasr_nano") return SttModelKind::kFunAsrNano;
92
+ if (modelType == "qwen3_asr") return SttModelKind::kQwen3Asr;
91
93
  if (modelType == "fire_red_asr") return SttModelKind::kFireRedAsr;
92
94
  if (modelType == "moonshine") return SttModelKind::kMoonshine;
93
95
  if (modelType == "moonshine_v2") return SttModelKind::kMoonshineV2;
@@ -126,6 +128,8 @@ static bool CapabilitySupportsKind(
126
128
  return cap.hasWhisper;
127
129
  case SttModelKind::kFunAsrNano:
128
130
  return cap.hasFunAsrNano;
131
+ case SttModelKind::kQwen3Asr:
132
+ return cap.hasQwen3Asr;
129
133
  case SttModelKind::kFireRedAsr:
130
134
  return cap.hasFireRedAsr;
131
135
  case SttModelKind::kMoonshine:
@@ -189,6 +193,8 @@ static std::vector<SttModelKind> GetKindsFromDirName(const std::string& modelDir
189
193
  add(SttModelKind::kTransducer);
190
194
  add(SttModelKind::kZipformerCtc);
191
195
  }
196
+ if (lower.find("qwen3-asr") != std::string::npos || lower.find("qwen3_asr") != std::string::npos)
197
+ add(SttModelKind::kQwen3Asr);
192
198
  if (lower.find("funasr") != std::string::npos)
193
199
  add(SttModelKind::kFunAsrNano);
194
200
  if (lower.find("canary") != std::string::npos)
@@ -249,6 +255,19 @@ static SttCandidatePaths GatherSttCandidatePaths(
249
255
  p.funasrTokenizerDir = vocabInSubdir.substr(0, lastSlash);
250
256
  }
251
257
  }
258
+ p.qwen3ConvFrontend = FindOnnxByAnyToken(files, {"conv_frontend"}, preferInt8);
259
+ {
260
+ for (const auto& entry : files) {
261
+ if (entry.nameLower != "tokenizer_config.json") continue;
262
+ size_t slash = entry.path.find_last_of("/\\");
263
+ if (slash == std::string::npos) continue;
264
+ std::string dir = entry.path.substr(0, slash);
265
+ if (Qwen3TokenizerDirHasVocabAndMerges(files, dir)) {
266
+ p.qwen3TokenizerDir = dir;
267
+ break;
268
+ }
269
+ }
270
+ }
252
271
  p.moonshinePreprocessor = FindOnnxByAnyToken(files, {"preprocess", "preprocessor"}, preferInt8);
253
272
  p.moonshineEncoder = FindOnnxByAnyToken(files, {"encode", "encoder_model"}, preferInt8);
254
273
  p.moonshineUncachedDecoder = FindOnnxByAnyToken(files, {"uncached_decode", "uncached"}, preferInt8);
@@ -258,7 +277,8 @@ static SttCandidatePaths GatherSttCandidatePaths(
258
277
  static const std::vector<std::string> modelExcludes = {
259
278
  "encoder", "decoder", "joiner", "vocoder", "acoustic", "embedding", "llm",
260
279
  "encoder_adaptor", "encoder-adaptor", "encoder_model", "decoder_model",
261
- "merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached"
280
+ "merged_decoder", "decoder_model_merged", "preprocess", "encode", "uncached", "cached",
281
+ "conv_frontend"
262
282
  };
263
283
  p.paraformerModel = FindOnnxByAnyToken(files, {"model"}, preferInt8);
264
284
  if (!p.paraformerModel.empty()) {
@@ -302,6 +322,7 @@ static SttPathHints GetSttPathHints(const std::string& modelDir) {
302
322
  h.isLikelyWenetCtc = lower.find("wenet") != std::string::npos;
303
323
  h.isLikelySenseVoice = lower.find("sense") != std::string::npos || lower.find("sensevoice") != std::string::npos;
304
324
  h.isLikelyFunAsrNano = lower.find("funasr") != std::string::npos || lower.find("funasr-nano") != std::string::npos;
325
+ h.isLikelyQwen3Asr = lower.find("qwen3-asr") != std::string::npos || lower.find("qwen3_asr") != std::string::npos;
305
326
  h.isLikelyZipformer = lower.find("zipformer") != std::string::npos;
306
327
  h.isLikelyMoonshine = lower.find("moonshine") != std::string::npos;
307
328
  h.isLikelyDolphin = lower.find("dolphin") != std::string::npos;
@@ -404,7 +425,9 @@ static SttCapabilities ComputeSttCapabilities(const SttCandidatePaths& paths, co
404
425
  c.hasTransducer = !paths.encoder.empty() && !paths.decoder.empty() && !paths.joiner.empty();
405
426
  bool hasWhisperEnc = !paths.encoder.empty();
406
427
  bool hasWhisperDec = !paths.decoder.empty();
407
- c.hasWhisper = hasWhisperEnc && hasWhisperDec && paths.joiner.empty();
428
+ bool hasQwen3Tok = !paths.qwen3TokenizerDir.empty();
429
+ c.hasQwen3Asr = !paths.qwen3ConvFrontend.empty() && hasWhisperEnc && hasWhisperDec && hasQwen3Tok;
430
+ c.hasWhisper = hasWhisperEnc && hasWhisperDec && paths.joiner.empty() && !c.hasQwen3Asr;
408
431
  bool hasFunAsrTok = !paths.funasrTokenizerDir.empty();
409
432
  c.hasFunAsrNano = !paths.funasrEncoderAdaptor.empty() && !paths.funasrLLM.empty() &&
410
433
  !paths.funasrEmbedding.empty() && hasFunAsrTok;
@@ -446,6 +469,7 @@ static void CollectDetectedModels(
446
469
  out.push_back({"paraformer", modelDir});
447
470
  }
448
471
  if (cap.hasWhisper) out.push_back({"whisper", modelDir});
472
+ if (cap.hasQwen3Asr) out.push_back({"qwen3_asr", modelDir});
449
473
  if (cap.hasFunAsrNano) out.push_back({"funasr_nano", modelDir});
450
474
  if (cap.hasMoonshine) out.push_back({"moonshine", modelDir});
451
475
  if (cap.hasMoonshineV2) out.push_back({"moonshine_v2", modelDir});
@@ -507,6 +531,10 @@ static SttModelKind ResolveSttKind(
507
531
  outError = "FunASR Nano model requested but required files not found in " + modelDir;
508
532
  return SttModelKind::kUnknown;
509
533
  }
534
+ if (selected == SttModelKind::kQwen3Asr && !cap.hasQwen3Asr) {
535
+ outError = "Qwen3-ASR model requested but conv_frontend/encoder/decoder/tokenizer not found in " + modelDir;
536
+ return SttModelKind::kUnknown;
537
+ }
510
538
  if (selected == SttModelKind::kMoonshine && !cap.hasMoonshine) {
511
539
  outError = "Moonshine v1 model requested but preprocess/encode/uncached_decode/cached_decode not found in " + modelDir;
512
540
  return SttModelKind::kUnknown;
@@ -573,7 +601,9 @@ static SttModelKind ResolveSttKind(
573
601
  if (!paths.paraformerModel.empty()) return SttModelKind::kParaformer;
574
602
  if (cap.hasCanary) return SttModelKind::kCanary;
575
603
  if (cap.hasFireRedAsr) return SttModelKind::kFireRedAsr;
604
+ if (cap.hasQwen3Asr && hints.isLikelyQwen3Asr) return SttModelKind::kQwen3Asr;
576
605
  if (cap.hasWhisper) return SttModelKind::kWhisper;
606
+ if (cap.hasQwen3Asr) return SttModelKind::kQwen3Asr;
577
607
  if (cap.hasFunAsrNano) return SttModelKind::kFunAsrNano;
578
608
  if (cap.hasMoonshineV2) return SttModelKind::kMoonshineV2;
579
609
  if (cap.hasDolphin) return SttModelKind::kDolphin;
@@ -618,6 +648,12 @@ static void ApplyPathsForSttKind(SttModelKind kind, const SttCandidatePaths& can
618
648
  resultPaths.funasrEmbedding = candidate.funasrEmbedding;
619
649
  resultPaths.funasrTokenizer = candidate.funasrTokenizerDir;
620
650
  break;
651
+ case SttModelKind::kQwen3Asr:
652
+ resultPaths.qwen3ConvFrontend = candidate.qwen3ConvFrontend;
653
+ resultPaths.qwen3Encoder = candidate.encoder;
654
+ resultPaths.qwen3Decoder = candidate.decoder;
655
+ resultPaths.qwen3Tokenizer = candidate.qwen3TokenizerDir;
656
+ break;
621
657
  case SttModelKind::kMoonshine:
622
658
  resultPaths.moonshinePreprocessor = candidate.moonshinePreprocessor;
623
659
  resultPaths.moonshineEncoder = candidate.moonshineEncoder;
@@ -711,13 +747,13 @@ SttDetectResult DetectSttModel(
711
747
  EmptyOrPath(candidate.encoder), EmptyOrPath(candidate.decoder));
712
748
  LOGI("DetectSttModel: funasr encoderAdaptor=%s llm=%s embedding=%s tokenizerDir=%s",
713
749
  EmptyOrPath(candidate.funasrEncoderAdaptor), EmptyOrPath(candidate.funasrLLM), EmptyOrPath(candidate.funasrEmbedding), EmptyOrPath(candidate.funasrTokenizerDir));
714
- LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
750
+ LOGI("DetectSttModel: hasTransducer=%d hasWhisper=%d hasMoonshine=%d hasMoonshineV2=%d hasParaformer=%d hasFunAsrNano=%d hasQwen3Asr=%d hasDolphin=%d hasFireRedAsr=%d hasFireRedCtc=%d hasCanary=%d hasOmnilingual=%d hasMedAsr=%d hasTeleSpeechCtc=%d hasToneCtc=%d",
715
751
  (int)cap.hasTransducer, (int)cap.hasWhisper, (int)cap.hasMoonshine, (int)cap.hasMoonshineV2,
716
- (int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
752
+ (int)cap.hasParaformer, (int)cap.hasFunAsrNano, (int)cap.hasQwen3Asr, (int)cap.hasDolphin, (int)cap.hasFireRedAsr, (int)cap.hasFireRedCtc,
717
753
  (int)cap.hasCanary, (int)cap.hasOmnilingual, (int)cap.hasMedAsr, (int)cap.hasTeleSpeechCtc, (int)cap.hasToneCtc);
718
- LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
754
+ LOGI("DetectSttModel: hints isLikelyNemo=%d isLikelyTdt=%d isLikelyWenetCtc=%d isLikelySenseVoice=%d isLikelyFunAsrNano=%d isLikelyQwen3Asr=%d isLikelyZipformer=%d isLikelyMoonshine=%d isLikelyDolphin=%d isLikelyFireRedAsr=%d isLikelyCanary=%d isLikelyOmnilingual=%d isLikelyMedAsr=%d isLikelyTeleSpeech=%d isLikelyToneCtc=%d isLikelyParaformer=%d isLikelyVad=%d isLikelyTdnn=%d",
719
755
  (int)hints.isLikelyNemo, (int)hints.isLikelyTdt, (int)hints.isLikelyWenetCtc, (int)hints.isLikelySenseVoice,
720
- (int)hints.isLikelyFunAsrNano, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
756
+ (int)hints.isLikelyFunAsrNano, (int)hints.isLikelyQwen3Asr, (int)hints.isLikelyZipformer, (int)hints.isLikelyMoonshine, (int)hints.isLikelyDolphin,
721
757
  (int)hints.isLikelyFireRedAsr, (int)hints.isLikelyCanary, (int)hints.isLikelyOmnilingual, (int)hints.isLikelyMedAsr,
722
758
  (int)hints.isLikelyTeleSpeech, (int)hints.isLikelyToneCtc, (int)hints.isLikelyParaformer, (int)hints.isLikelyVad, (int)hints.isLikelyTdnn);
723
759
  }
@@ -747,7 +783,8 @@ SttDetectResult DetectSttModel(
747
783
  }
748
784
 
749
785
  LOGI("DetectSttModel: selected kind=%d (%s)", static_cast<int>(result.selectedKind), KindToName(result.selectedKind));
750
- result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano);
786
+ result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano &&
787
+ result.selectedKind != SttModelKind::kQwen3Asr);
751
788
  ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
752
789
 
753
790
  if (!candidate.tokens.empty() && FileExists(candidate.tokens)) {
@@ -808,6 +845,11 @@ SttDetectResult DetectSttModel(
808
845
  EmptyOrPath(result.paths.funasrEncoderAdaptor), EmptyOrPath(result.paths.funasrLLM),
809
846
  EmptyOrPath(result.paths.funasrEmbedding), EmptyOrPath(result.paths.funasrTokenizer));
810
847
  break;
848
+ case SttModelKind::kQwen3Asr:
849
+ LOGI("DetectSttModel: paths set qwen3_asr conv=%s encoder=%s decoder=%s tokenizer=%s",
850
+ EmptyOrPath(result.paths.qwen3ConvFrontend), EmptyOrPath(result.paths.qwen3Encoder),
851
+ EmptyOrPath(result.paths.qwen3Decoder), EmptyOrPath(result.paths.qwen3Tokenizer));
852
+ break;
811
853
  default:
812
854
  break;
813
855
  }
@@ -854,7 +896,8 @@ SttDetectResult DetectSttModelFromFileList(
854
896
  return result;
855
897
  }
856
898
 
857
- result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano);
899
+ result.tokensRequired = (result.selectedKind != SttModelKind::kFunAsrNano &&
900
+ result.selectedKind != SttModelKind::kQwen3Asr);
858
901
  ApplyPathsForSttKind(result.selectedKind, candidate, result.paths);
859
902
 
860
903
  result.paths.tokens = candidate.tokens;
@@ -2,7 +2,7 @@
2
2
  * sherpa-onnx-model-detect-tts.cpp
3
3
  *
4
4
  * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
5
- * nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
5
+ * nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice, Supertonic.
6
6
  *
7
7
  * --- Detection pipeline (overview) ---
8
8
  *
@@ -56,6 +56,7 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
56
56
  if (modelType == "kitten") return TtsModelKind::kKitten;
57
57
  if (modelType == "pocket") return TtsModelKind::kPocket;
58
58
  if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
59
+ if (modelType == "supertonic") return TtsModelKind::kSupertonic;
59
60
  return TtsModelKind::kUnknown;
60
61
  }
61
62
 
@@ -68,6 +69,7 @@ static bool CapabilitySupportsTtsKind(
68
69
  bool hasMatcha,
69
70
  bool hasPocket,
70
71
  bool hasZipvoice,
72
+ bool hasSupertonic,
71
73
  bool hasVoicesFile,
72
74
  bool hasDataDir
73
75
  ) {
@@ -83,6 +85,8 @@ static bool CapabilitySupportsTtsKind(
83
85
  return hasPocket;
84
86
  case TtsModelKind::kZipvoice:
85
87
  return hasZipvoice;
88
+ case TtsModelKind::kSupertonic:
89
+ return hasSupertonic;
86
90
  default:
87
91
  return false;
88
92
  }
@@ -108,6 +112,7 @@ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& model
108
112
  if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
109
113
  if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
110
114
  if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
115
+ if (lower.find("supertonic") != std::string::npos) add(TtsModelKind::kSupertonic);
111
116
  if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
112
117
  if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
113
118
  if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
@@ -143,16 +148,27 @@ static TtsDetectResult DetectTtsModelFromFiles(
143
148
  std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
144
149
  std::string vocabJsonFile = FindFileByName(files, "vocab.json");
145
150
  std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
151
+ std::string durationPredictor = FindOnnxByAnyToken(files, {"duration_predictor", "duration-predictor"}, std::nullopt);
152
+ std::string textEncoderSupertonic = FindOnnxByAnyToken(files, {"text_encoder", "text-encoder"}, std::nullopt);
153
+ std::string vectorEstimator = FindOnnxByAnyToken(files, {"vector_estimator", "vector-estimator"}, std::nullopt);
154
+ std::string ttsJsonFile = FindFileByName(files, "tts.json");
155
+ std::string unicodeIndexerFile = FindFileByName(files, "unicode_indexer.bin");
156
+ std::string voiceStyleFile = FindFileByName(files, "voice.bin");
146
157
 
147
158
  std::vector<std::string> modelExcludes = {
148
- "acoustic", "vocoder", "encoder", "decoder", "joiner"
159
+ "acoustic", "vocoder", "encoder", "decoder", "joiner",
160
+ // Supertonic component models are not VITS monolithic model.onnx files.
161
+ "duration_predictor", "duration-predictor",
162
+ "text_encoder", "text-encoder",
163
+ "vector_estimator", "vector-estimator"
149
164
  };
150
165
  std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
151
166
  if (ttsModel.empty()) {
152
167
  ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
153
168
  }
154
169
 
155
- bool hasVits = !ttsModel.empty();
170
+ // VITS requires both model.onnx-like file and tokens.txt
171
+ bool hasVits = !ttsModel.empty() && !tokensFile.empty();
156
172
  std::string modelDirLower = ToLower(modelDir);
157
173
  bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
158
174
  bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
@@ -169,6 +185,9 @@ static TtsDetectResult DetectTtsModelFromFiles(
169
185
  }
170
186
  bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
171
187
  !textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
188
+ bool hasSupertonic = !durationPredictor.empty() && !textEncoderSupertonic.empty() &&
189
+ !vectorEstimator.empty() && !vocoder.empty() && !ttsJsonFile.empty() &&
190
+ !unicodeIndexerFile.empty() && !voiceStyleFile.empty();
172
191
  bool hasDataDir = !dataDirPath.empty();
173
192
 
174
193
  bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
@@ -177,6 +196,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
177
196
  if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
178
197
  if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
179
198
  if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
199
+ if (hasSupertonic) result.detectedModels.push_back({"supertonic", modelDir});
180
200
  if (hasVoicesFile) {
181
201
  if (isLikelyKitten && !isLikelyKokoro) {
182
202
  result.detectedModels.push_back({"kitten", modelDir});
@@ -205,7 +225,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
205
225
  std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
206
226
  if (!nameCandidates.empty()) {
207
227
  for (TtsModelKind k : nameCandidates) {
208
- if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
228
+ if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice, hasSupertonic,
209
229
  hasVoicesFile, hasDataDir)) {
210
230
  selected = k;
211
231
  break;
@@ -216,6 +236,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
216
236
  if (hasMatcha) selected = TtsModelKind::kMatcha;
217
237
  else if (hasPocket) selected = TtsModelKind::kPocket;
218
238
  else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
239
+ else if (hasSupertonic) selected = TtsModelKind::kSupertonic;
219
240
  else if (hasVoicesFile) {
220
241
  if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
221
242
  else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
@@ -256,6 +277,12 @@ static TtsDetectResult DetectTtsModelFromFiles(
256
277
  result.paths.textConditioner = textConditioner;
257
278
  result.paths.vocabJson = vocabJsonFile;
258
279
  result.paths.tokenScoresJson = tokenScoresJsonFile;
280
+ result.paths.durationPredictor = durationPredictor;
281
+ result.paths.textEncoder = textEncoderSupertonic;
282
+ result.paths.vectorEstimator = vectorEstimator;
283
+ result.paths.ttsJson = ttsJsonFile;
284
+ result.paths.unicodeIndexer = unicodeIndexerFile;
285
+ result.paths.voiceStyle = voiceStyleFile;
259
286
 
260
287
  auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
261
288
  if (!validation.ok) {
@@ -20,6 +20,7 @@ enum class SttModelKind {
20
20
  kZipformerCtc,
21
21
  kWhisper,
22
22
  kFunAsrNano,
23
+ kQwen3Asr,
23
24
  kFireRedAsr,
24
25
  kMoonshine,
25
26
  kMoonshineV2,
@@ -38,7 +39,8 @@ enum class TtsModelKind {
38
39
  kKokoro,
39
40
  kKitten,
40
41
  kPocket,
41
- kZipvoice
42
+ kZipvoice,
43
+ kSupertonic
42
44
  };
43
45
 
44
46
  struct SttModelPaths {
@@ -56,6 +58,11 @@ struct SttModelPaths {
56
58
  std::string funasrLLM;
57
59
  std::string funasrEmbedding;
58
60
  std::string funasrTokenizer;
61
+ /** Qwen3-ASR: conv_frontend + encoder + decoder + tokenizer directory. */
62
+ std::string qwen3ConvFrontend;
63
+ std::string qwen3Encoder;
64
+ std::string qwen3Decoder;
65
+ std::string qwen3Tokenizer;
59
66
  // Moonshine
60
67
  std::string moonshinePreprocessor;
61
68
  std::string moonshineEncoder;
@@ -88,6 +95,8 @@ struct SttCandidatePaths {
88
95
  std::string funasrLLM;
89
96
  std::string funasrEmbedding;
90
97
  std::string funasrTokenizerDir;
98
+ std::string qwen3ConvFrontend;
99
+ std::string qwen3TokenizerDir;
91
100
  std::string moonshinePreprocessor;
92
101
  std::string moonshineEncoder;
93
102
  std::string moonshineUncachedDecoder;
@@ -103,6 +112,7 @@ struct SttPathHints {
103
112
  bool isLikelyWenetCtc = false;
104
113
  bool isLikelySenseVoice = false;
105
114
  bool isLikelyFunAsrNano = false;
115
+ bool isLikelyQwen3Asr = false;
106
116
  bool isLikelyZipformer = false;
107
117
  bool isLikelyMoonshine = false;
108
118
  bool isLikelyDolphin = false;
@@ -127,6 +137,7 @@ struct SttCapabilities {
127
137
  bool hasMoonshineV2 = false;
128
138
  bool hasParaformer = false;
129
139
  bool hasFunAsrNano = false;
140
+ bool hasQwen3Asr = false;
130
141
  bool hasDolphin = false;
131
142
  bool hasFireRedAsr = false;
132
143
  /** True when dir name suggests Fire Red but only a single CTC/paraformer model (no encoder/decoder). Use zipformer_ctc. */
@@ -154,6 +165,13 @@ struct TtsModelPaths {
154
165
  std::string textConditioner;
155
166
  std::string vocabJson;
156
167
  std::string tokenScoresJson;
168
+ // Supertonic TTS
169
+ std::string durationPredictor;
170
+ std::string textEncoder;
171
+ std::string vectorEstimator;
172
+ std::string ttsJson;
173
+ std::string unicodeIndexer;
174
+ std::string voiceStyle;
157
175
  };
158
176
 
159
177
  struct SttDetectResult {
@@ -23,6 +23,7 @@ const char* SttModelKindToString(SttModelKind k) {
23
23
  case SttModelKind::kZipformerCtc: return "zipformer_ctc";
24
24
  case SttModelKind::kWhisper: return "whisper";
25
25
  case SttModelKind::kFunAsrNano: return "funasr_nano";
26
+ case SttModelKind::kQwen3Asr: return "qwen3_asr";
26
27
  case SttModelKind::kFireRedAsr: return "fire_red_asr";
27
28
  case SttModelKind::kMoonshine: return "moonshine";
28
29
  case SttModelKind::kMoonshineV2: return "moonshine_v2";
@@ -79,6 +80,10 @@ jobject SttDetectResultToJava(JNIEnv* env, const SttDetectResult& result) {
79
80
  PutString(env, pathsMap, mapPut, "funasrLLM", result.paths.funasrLLM);
80
81
  PutString(env, pathsMap, mapPut, "funasrEmbedding", result.paths.funasrEmbedding);
81
82
  PutString(env, pathsMap, mapPut, "funasrTokenizer", result.paths.funasrTokenizer);
83
+ PutString(env, pathsMap, mapPut, "qwen3ConvFrontend", result.paths.qwen3ConvFrontend);
84
+ PutString(env, pathsMap, mapPut, "qwen3Encoder", result.paths.qwen3Encoder);
85
+ PutString(env, pathsMap, mapPut, "qwen3Decoder", result.paths.qwen3Decoder);
86
+ PutString(env, pathsMap, mapPut, "qwen3Tokenizer", result.paths.qwen3Tokenizer);
82
87
  PutString(env, pathsMap, mapPut, "moonshinePreprocessor", result.paths.moonshinePreprocessor);
83
88
  PutString(env, pathsMap, mapPut, "moonshineEncoder", result.paths.moonshineEncoder);
84
89
  PutString(env, pathsMap, mapPut, "moonshineUncachedDecoder", result.paths.moonshineUncachedDecoder);