react-native-sherpa-onnx 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. package/LICENSE +1 -0
  2. package/README.md +92 -21
  3. package/SherpaOnnx.podspec +3 -0
  4. package/THIRD_PARTY_LICENSES/README.md +62 -0
  5. package/THIRD_PARTY_LICENSES/ffmpeg.txt +502 -0
  6. package/THIRD_PARTY_LICENSES/libarchive.txt +65 -0
  7. package/THIRD_PARTY_LICENSES/nvidia_omla.txt +181 -0
  8. package/THIRD_PARTY_LICENSES/onnxruntime.txt +21 -0
  9. package/THIRD_PARTY_LICENSES/opus.txt +44 -0
  10. package/THIRD_PARTY_LICENSES/sherpa-onnx.txt +201 -0
  11. package/THIRD_PARTY_LICENSES/shine.txt +482 -0
  12. package/THIRD_PARTY_LICENSES/zstd.txt +30 -0
  13. package/android/build.gradle +7 -3
  14. package/android/prebuilt-download.gradle +344 -152
  15. package/android/prebuilt-versions.gradle +1 -1
  16. package/android/src/main/assets/model_licenses/asr-models-license-status.csv +409 -0
  17. package/android/src/main/assets/model_licenses/qnn-asr-models-license-status.csv +695 -0
  18. package/android/src/main/assets/model_licenses/tts-models-license-status.csv +596 -0
  19. package/android/src/main/cpp/CMakeLists.txt +28 -10
  20. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +2 -2
  21. package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +268 -2
  22. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +37 -6
  23. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +9 -1
  24. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +7 -0
  25. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +18 -2
  26. package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +40 -10
  27. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +99 -0
  28. package/android/src/main/java/com/sherpaonnx/SherpaOnnxOnlineSttHelper.kt +4 -1
  29. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +127 -97
  30. package/ios/Resources/model_licenses/asr-models-license-status.csv +409 -0
  31. package/ios/Resources/model_licenses/qnn-asr-models-license-status.csv +695 -0
  32. package/ios/Resources/model_licenses/tts-models-license-status.csv +596 -0
  33. package/ios/SherpaOnnx+OnlineSTT.mm +2 -0
  34. package/ios/SherpaOnnx+PcmLiveStream.mm +2 -29
  35. package/ios/SherpaOnnx+TTS.mm +179 -20
  36. package/ios/SherpaOnnx.mm +54 -0
  37. package/ios/SherpaOnnxAudioConvert.h +10 -0
  38. package/ios/SherpaOnnxAudioConvert.mm +257 -1
  39. package/ios/archive/sherpa-onnx-archive-helper.h +3 -0
  40. package/ios/archive/sherpa-onnx-archive-helper.mm +39 -6
  41. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +49 -6
  42. package/ios/model_detect/sherpa-onnx-model-detect.h +9 -1
  43. package/ios/model_detect/sherpa-onnx-validate-tts.mm +18 -2
  44. package/ios/online_stt/sherpa-onnx-online-stt-wrapper.h +1 -0
  45. package/ios/online_stt/sherpa-onnx-online-stt-wrapper.mm +4 -0
  46. package/ios/tts/sherpa-onnx-tts-wrapper.h +37 -0
  47. package/ios/tts/sherpa-onnx-tts-wrapper.mm +158 -3
  48. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  49. package/lib/module/audio/index.js +8 -0
  50. package/lib/module/audio/index.js.map +1 -1
  51. package/lib/module/download/ModelDownloadManager.js +10 -929
  52. package/lib/module/download/ModelDownloadManager.js.map +1 -1
  53. package/lib/module/download/activeModelOperations.js +26 -0
  54. package/lib/module/download/activeModelOperations.js.map +1 -0
  55. package/lib/module/download/background-downloader-types.js +2 -0
  56. package/lib/module/download/background-downloader-types.js.map +1 -0
  57. package/lib/module/download/bulkPurge.js +72 -0
  58. package/lib/module/download/bulkPurge.js.map +1 -0
  59. package/lib/module/download/checksumPrompt.js +19 -0
  60. package/lib/module/download/checksumPrompt.js.map +1 -0
  61. package/lib/module/download/constants.js +7 -0
  62. package/lib/module/download/constants.js.map +1 -0
  63. package/lib/module/download/downloadEvents.js +35 -0
  64. package/lib/module/download/downloadEvents.js.map +1 -0
  65. package/lib/module/download/downloadTask.js +438 -0
  66. package/lib/module/download/downloadTask.js.map +1 -0
  67. package/lib/module/download/ensureModel.js +89 -0
  68. package/lib/module/download/ensureModel.js.map +1 -0
  69. package/lib/module/download/index.js +4 -4
  70. package/lib/module/download/index.js.map +1 -1
  71. package/lib/module/download/localModels.js +151 -0
  72. package/lib/module/download/localModels.js.map +1 -0
  73. package/lib/module/download/modelExtraction.js +174 -0
  74. package/lib/module/download/modelExtraction.js.map +1 -0
  75. package/lib/module/download/paths.js +98 -0
  76. package/lib/module/download/paths.js.map +1 -0
  77. package/lib/module/download/postDownloadProcessing.js +206 -0
  78. package/lib/module/download/postDownloadProcessing.js.map +1 -0
  79. package/lib/module/download/protectedModelKeys.js +31 -0
  80. package/lib/module/download/protectedModelKeys.js.map +1 -0
  81. package/lib/module/download/registry.js +268 -0
  82. package/lib/module/download/registry.js.map +1 -0
  83. package/lib/module/download/retry.js +59 -0
  84. package/lib/module/download/retry.js.map +1 -0
  85. package/lib/module/download/types.js +17 -0
  86. package/lib/module/download/types.js.map +1 -0
  87. package/lib/module/download/validation.js +101 -5
  88. package/lib/module/download/validation.js.map +1 -1
  89. package/lib/module/{download → extraction}/extractTarBz2.js +3 -1
  90. package/lib/module/extraction/extractTarBz2.js.map +1 -0
  91. package/lib/module/{download → extraction}/extractTarZst.js +3 -1
  92. package/lib/module/extraction/extractTarZst.js.map +1 -0
  93. package/lib/module/extraction/index.js +3 -4
  94. package/lib/module/extraction/index.js.map +1 -1
  95. package/lib/module/index.js +1 -1
  96. package/lib/module/index.js.map +1 -1
  97. package/lib/module/licenses.js +63 -0
  98. package/lib/module/licenses.js.map +1 -0
  99. package/lib/module/stt/index.js +16 -2
  100. package/lib/module/stt/index.js.map +1 -1
  101. package/lib/module/stt/streaming.js +2 -0
  102. package/lib/module/stt/streaming.js.map +1 -1
  103. package/lib/module/stt/streamingTypes.js.map +1 -1
  104. package/lib/module/stt/types.js.map +1 -1
  105. package/lib/module/tts/index.js +21 -3
  106. package/lib/module/tts/index.js.map +1 -1
  107. package/lib/module/tts/streaming.js +5 -1
  108. package/lib/module/tts/streaming.js.map +1 -1
  109. package/lib/module/tts/types.js +4 -1
  110. package/lib/module/tts/types.js.map +1 -1
  111. package/lib/module/utils.js +16 -1
  112. package/lib/module/utils.js.map +1 -1
  113. package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -6
  114. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  115. package/lib/typescript/src/audio/index.d.ts +10 -0
  116. package/lib/typescript/src/audio/index.d.ts.map +1 -1
  117. package/lib/typescript/src/download/ModelDownloadManager.d.ts +11 -108
  118. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
  119. package/lib/typescript/src/download/activeModelOperations.d.ts +6 -0
  120. package/lib/typescript/src/download/activeModelOperations.d.ts.map +1 -0
  121. package/lib/typescript/src/download/background-downloader-types.d.ts +64 -0
  122. package/lib/typescript/src/download/background-downloader-types.d.ts.map +1 -0
  123. package/lib/typescript/src/download/bulkPurge.d.ts +14 -0
  124. package/lib/typescript/src/download/bulkPurge.d.ts.map +1 -0
  125. package/lib/typescript/src/download/checksumPrompt.d.ts +3 -0
  126. package/lib/typescript/src/download/checksumPrompt.d.ts.map +1 -0
  127. package/lib/typescript/src/download/constants.d.ts +5 -0
  128. package/lib/typescript/src/download/constants.d.ts.map +1 -0
  129. package/lib/typescript/src/download/downloadEvents.d.ts +6 -0
  130. package/lib/typescript/src/download/downloadEvents.d.ts.map +1 -0
  131. package/lib/typescript/src/download/downloadTask.d.ts +30 -0
  132. package/lib/typescript/src/download/downloadTask.d.ts.map +1 -0
  133. package/lib/typescript/src/download/ensureModel.d.ts +26 -0
  134. package/lib/typescript/src/download/ensureModel.d.ts.map +1 -0
  135. package/lib/typescript/src/download/index.d.ts +7 -7
  136. package/lib/typescript/src/download/index.d.ts.map +1 -1
  137. package/lib/typescript/src/download/localModels.d.ts +15 -0
  138. package/lib/typescript/src/download/localModels.d.ts.map +1 -0
  139. package/lib/typescript/src/download/modelExtraction.d.ts +36 -0
  140. package/lib/typescript/src/download/modelExtraction.d.ts.map +1 -0
  141. package/lib/typescript/src/download/paths.d.ts +28 -0
  142. package/lib/typescript/src/download/paths.d.ts.map +1 -0
  143. package/lib/typescript/src/download/postDownloadProcessing.d.ts +19 -0
  144. package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -0
  145. package/lib/typescript/src/download/protectedModelKeys.d.ts +6 -0
  146. package/lib/typescript/src/download/protectedModelKeys.d.ts.map +1 -0
  147. package/lib/typescript/src/download/registry.d.ts +14 -0
  148. package/lib/typescript/src/download/registry.d.ts.map +1 -0
  149. package/lib/typescript/src/download/retry.d.ts +15 -0
  150. package/lib/typescript/src/download/retry.d.ts.map +1 -0
  151. package/lib/typescript/src/download/types.d.ts +96 -0
  152. package/lib/typescript/src/download/types.d.ts.map +1 -0
  153. package/lib/typescript/src/download/validation.d.ts +19 -0
  154. package/lib/typescript/src/download/validation.d.ts.map +1 -1
  155. package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -0
  156. package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -0
  157. package/lib/typescript/src/index.d.ts +1 -0
  158. package/lib/typescript/src/index.d.ts.map +1 -1
  159. package/lib/typescript/src/licenses.d.ts +10 -0
  160. package/lib/typescript/src/licenses.d.ts.map +1 -0
  161. package/lib/typescript/src/stt/index.d.ts +4 -1
  162. package/lib/typescript/src/stt/index.d.ts.map +1 -1
  163. package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
  164. package/lib/typescript/src/stt/streamingTypes.d.ts +5 -0
  165. package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
  166. package/lib/typescript/src/stt/types.d.ts +3 -1
  167. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  168. package/lib/typescript/src/tts/index.d.ts +4 -2
  169. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  170. package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
  171. package/lib/typescript/src/tts/types.d.ts +12 -6
  172. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  173. package/lib/typescript/src/utils.d.ts +5 -0
  174. package/lib/typescript/src/utils.d.ts.map +1 -1
  175. package/package.json +6 -1
  176. package/scripts/{check-model-csvs.sh → ci/check-model-csvs.sh} +9 -2
  177. package/scripts/ci/collect_all_sherpa_model_streams.sh +101 -0
  178. package/scripts/ci/collect_one_sherpa_release_stream.sh +189 -0
  179. package/scripts/ci/sherpa_asr_model_release_streams.json +21 -0
  180. package/scripts/ci/sherpa_tts_model_release_streams.json +13 -0
  181. package/scripts/ci/update_model_license_csv.sh +765 -0
  182. package/scripts/setup-ios-framework.sh +14 -11
  183. package/scripts/update_commercial_use.js +73 -0
  184. package/src/NativeSherpaOnnx.ts +37 -6
  185. package/src/audio/index.ts +20 -0
  186. package/src/download/ModelDownloadManager.ts +57 -1343
  187. package/src/download/activeModelOperations.ts +38 -0
  188. package/src/download/background-downloader-types.ts +73 -0
  189. package/src/download/bulkPurge.ts +102 -0
  190. package/src/download/checksumPrompt.ts +25 -0
  191. package/src/download/constants.ts +5 -0
  192. package/src/download/downloadEvents.ts +55 -0
  193. package/src/download/downloadTask.ts +565 -0
  194. package/src/download/ensureModel.ts +124 -0
  195. package/src/download/index.ts +21 -4
  196. package/src/download/localModels.ts +234 -0
  197. package/src/download/modelExtraction.ts +244 -0
  198. package/src/download/paths.ts +134 -0
  199. package/src/download/postDownloadProcessing.ts +292 -0
  200. package/src/download/protectedModelKeys.ts +30 -0
  201. package/src/download/registry.ts +405 -0
  202. package/src/download/retry.ts +76 -0
  203. package/src/download/types.ts +120 -0
  204. package/src/download/validation.ts +114 -8
  205. package/src/{download → extraction}/extractTarBz2.ts +3 -1
  206. package/src/{download → extraction}/extractTarZst.ts +3 -1
  207. package/src/extraction/index.ts +3 -7
  208. package/src/index.tsx +1 -0
  209. package/src/licenses.ts +100 -0
  210. package/src/stt/index.ts +20 -2
  211. package/src/stt/streaming.ts +3 -0
  212. package/src/stt/streamingTypes.ts +5 -0
  213. package/src/stt/types.ts +3 -1
  214. package/src/tts/index.ts +33 -2
  215. package/src/tts/streaming.ts +12 -0
  216. package/src/tts/types.ts +15 -5
  217. package/src/utils.ts +22 -1
  218. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
  219. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
  220. package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +0 -301
  221. package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +0 -187
  222. package/lib/module/download/extractTarBz2.js.map +0 -1
  223. package/lib/module/download/extractTarZst.js.map +0 -1
  224. package/lib/typescript/src/download/extractTarBz2.d.ts.map +0 -1
  225. package/lib/typescript/src/download/extractTarZst.d.ts.map +0 -1
  226. package/scripts/check-qnn-support.sh +0 -78
  227. /package/lib/typescript/src/{download → extraction}/extractTarBz2.d.ts +0 -0
  228. /package/lib/typescript/src/{download → extraction}/extractTarZst.d.ts +0 -0
@@ -29,7 +29,7 @@ get_filename_component(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../.." ABSOL
29
29
  set(USE_FFMPEG ON)
30
30
  if(SHERPA_ONNX_DISABLE_FFMPEG)
31
31
  set(USE_FFMPEG OFF)
32
- message(STATUS "FFmpeg disabled (SHERPA_ONNX_DISABLE_FFMPEG=ON). convertAudioToWav16k/convertAudioToFormat will return an error at runtime.")
32
+ message(STATUS "FFmpeg disabled (SHERPA_ONNX_DISABLE_FFMPEG=ON). convertAudioToWav16k/convertAudioToFormat and decode (non-WAV) will return an error at runtime.")
33
33
  endif()
34
34
 
35
35
  if(USE_FFMPEG)
@@ -39,25 +39,32 @@ set(FFMPEG_PREBUILT_BASE "${PROJECT_ROOT}/../third_party/ffmpeg_prebuilt/android
39
39
  set(FFMPEG_PREBUILT_LIB "${FFMPEG_PREBUILT_BASE}/${ANDROID_ABI}/lib")
40
40
  set(FFMPEG_JNILIBS "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}")
41
41
  set(FFMPEG_INCLUDE_CPP "${PROJECT_ROOT}/src/main/cpp/include/ffmpeg")
42
- if(EXISTS "${FFMPEG_PREBUILT_BASE}/include")
42
+ # Require a real header — an empty third_party/.../include/ dir must not win over Gradle-populated cpp/include/ffmpeg.
43
+ if(EXISTS "${FFMPEG_PREBUILT_BASE}/include/libavcodec/avcodec.h")
43
44
  set(FFMPEG_INCLUDE_DIR "${FFMPEG_PREBUILT_BASE}/include")
44
45
  message(STATUS "FFmpeg headers: prebuilts ${FFMPEG_INCLUDE_DIR}")
45
- elseif(EXISTS "${FFMPEG_INCLUDE_CPP}")
46
+ elseif(EXISTS "${FFMPEG_INCLUDE_CPP}/libavcodec/avcodec.h")
46
47
  set(FFMPEG_INCLUDE_DIR "${FFMPEG_INCLUDE_CPP}")
47
- message(STATUS "FFmpeg headers: jniLibs/release ${FFMPEG_INCLUDE_DIR}")
48
+ message(STATUS "FFmpeg headers: module tree ${FFMPEG_INCLUDE_DIR}")
48
49
  else()
49
50
  message(FATAL_ERROR "FFmpeg headers missing. Either:\n"
50
51
  " - Build: cd third_party/ffmpeg_prebuilt && ./build_ffmpeg.sh (creates android/include)\n"
51
52
  " - Or use a release that includes include/ (Gradle extracts to ${FFMPEG_INCLUDE_CPP})")
52
53
  endif()
54
+ # Prebuilts may be either legacy layout android/<abi>/lib/ (build_ffmpeg.sh) or
55
+ # android/jni/<abi>/ (same as Gradle THIRD_PARTY in docs/PREBUILT_RESOLUTION.md).
56
+ set(FFMPEG_PREBUILT_JNI_ABI "${FFMPEG_PREBUILT_BASE}/jni/${ANDROID_ABI}")
53
57
  if(EXISTS "${FFMPEG_PREBUILT_LIB}/libavcodec.so")
54
58
  set(FFMPEG_LIB_DIR "${FFMPEG_PREBUILT_LIB}")
55
59
  message(STATUS "FFmpeg libs: prebuilts ${FFMPEG_LIB_DIR}")
60
+ elseif(EXISTS "${FFMPEG_PREBUILT_JNI_ABI}/libavcodec.so")
61
+ set(FFMPEG_LIB_DIR "${FFMPEG_PREBUILT_JNI_ABI}")
62
+ message(STATUS "FFmpeg libs: prebuilts (jni/<abi>) ${FFMPEG_LIB_DIR}")
56
63
  elseif(EXISTS "${FFMPEG_JNILIBS}/libavcodec.so")
57
64
  set(FFMPEG_LIB_DIR "${FFMPEG_JNILIBS}")
58
65
  message(STATUS "FFmpeg libs: jniLibs ${FFMPEG_LIB_DIR}")
59
66
  else()
60
- message(FATAL_ERROR "FFmpeg libs missing for ABI ${ANDROID_ABI}. Run third_party/ffmpeg_prebuilt/copy_prebuilts_to_sdk.js or ensure ANDROID_RELEASE_TAG release is downloaded.")
67
+ message(FATAL_ERROR "FFmpeg libs missing for ABI ${ANDROID_ABI}. Run a Gradle Android build so prebuilts populate jniLibs, or add local .so under jniLibs / ffmpeg prebuilts paths.")
61
68
  endif()
62
69
  endif(USE_FFMPEG)
63
70
 
@@ -82,7 +89,6 @@ set(SOURCES
82
89
  jni/model_detect/sherpa-onnx-stt-wrapper.cpp
83
90
  jni/model_detect/sherpa-onnx-tts-wrapper.cpp
84
91
  jni/audio/sherpa-onnx-audio-convert-jni.cpp
85
- jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp
86
92
  crypto/sha256.cpp
87
93
  )
88
94
 
@@ -93,10 +99,19 @@ set(LIBARCHIVE_PREBUILT_LIB "${LIBARCHIVE_PREBUILT_BASE}/${ANDROID_ABI}/lib")
93
99
  set(LIBARCHIVE_JNILIBS "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}")
94
100
  set(LIBARCHIVE_INCLUDE_CPP "${PROJECT_ROOT}/src/main/cpp/include/libarchive")
95
101
  set(USE_LIBARCHIVE_PREBUILT OFF)
102
+ set(LIBARCHIVE_PREBUILT_JNI_ABI "${LIBARCHIVE_PREBUILT_BASE}/jni/${ANDROID_ABI}")
96
103
  if(EXISTS "${LIBARCHIVE_PREBUILT_LIB}/libarchive.so")
97
104
  set(USE_LIBARCHIVE_PREBUILT ON)
98
105
  set(LIBARCHIVE_LIB_DIR "${LIBARCHIVE_PREBUILT_LIB}")
99
- if(EXISTS "${LIBARCHIVE_PREBUILT_BASE}/include")
106
+ if(EXISTS "${LIBARCHIVE_PREBUILT_BASE}/include/archive.h")
107
+ set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_PREBUILT_BASE}/include")
108
+ elseif(EXISTS "${LIBARCHIVE_INCLUDE_CPP}/archive.h")
109
+ set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_INCLUDE_CPP}")
110
+ endif()
111
+ elseif(EXISTS "${LIBARCHIVE_PREBUILT_JNI_ABI}/libarchive.so")
112
+ set(USE_LIBARCHIVE_PREBUILT ON)
113
+ set(LIBARCHIVE_LIB_DIR "${LIBARCHIVE_PREBUILT_JNI_ABI}")
114
+ if(EXISTS "${LIBARCHIVE_PREBUILT_BASE}/include/archive.h")
100
115
  set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_PREBUILT_BASE}/include")
101
116
  elseif(EXISTS "${LIBARCHIVE_INCLUDE_CPP}/archive.h")
102
117
  set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_INCLUDE_CPP}")
@@ -128,15 +143,19 @@ add_library(sherpaonnx SHARED
128
143
  # If we used IMPORTED here, AGP would also copy .so from CMake --> duplicate in mergeNativeLibs.
129
144
  set(SHERPA_ONNX_PREBUILT_BASE "${PROJECT_ROOT}/../third_party/sherpa-onnx-prebuilt/android")
130
145
  set(SHERPA_ONNX_ABI_LIB "${SHERPA_ONNX_PREBUILT_BASE}/${ANDROID_ABI}/lib")
146
+ set(SHERPA_ONNX_JNI_ABI "${SHERPA_ONNX_PREBUILT_BASE}/jni/${ANDROID_ABI}")
131
147
  set(SHERPA_C_API_LIB_DIR "")
132
148
  if(EXISTS "${SHERPA_ONNX_ABI_LIB}/libsherpa-onnx-c-api.so")
133
149
  set(SHERPA_C_API_LIB_DIR "${SHERPA_ONNX_ABI_LIB}")
134
150
  message(STATUS "sherpa-onnx C-API (link only): ${SHERPA_C_API_LIB_DIR}")
151
+ elseif(EXISTS "${SHERPA_ONNX_JNI_ABI}/libsherpa-onnx-c-api.so")
152
+ set(SHERPA_C_API_LIB_DIR "${SHERPA_ONNX_JNI_ABI}")
153
+ message(STATUS "sherpa-onnx C-API (link only, third_party jni/<abi>): ${SHERPA_C_API_LIB_DIR}")
135
154
  elseif(EXISTS "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}/libsherpa-onnx-c-api.so")
136
155
  set(SHERPA_C_API_LIB_DIR "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}")
137
156
  message(STATUS "sherpa-onnx C-API (link only, jniLibs): ${SHERPA_C_API_LIB_DIR}")
138
157
  else()
139
- message(WARNING "sherpa-onnx C-API not found. Zipvoice TTS will not be available. "
158
+ message(WARNING "sherpa-onnx C-API not found. Prebuilt sherpa native libs may be incomplete. "
140
159
  "Build prebuilts: cd third_party/sherpa-onnx-prebuilt && ./build_sherpa_onnx.sh")
141
160
  endif()
142
161
 
@@ -148,7 +167,6 @@ target_include_directories(sherpaonnx PRIVATE
148
167
  ${CMAKE_CURRENT_SOURCE_DIR}/jni/archive
149
168
  ${CMAKE_CURRENT_SOURCE_DIR}/jni/model_detect
150
169
  ${CMAKE_CURRENT_SOURCE_DIR}/jni/audio
151
- ${CMAKE_CURRENT_SOURCE_DIR}/jni/tts
152
170
  ${CMAKE_CURRENT_SOURCE_DIR}/include
153
171
  )
154
172
  if(USE_FFMPEG)
@@ -167,7 +185,7 @@ if(USE_LIBARCHIVE)
167
185
  target_compile_definitions(sherpaonnx PRIVATE HAVE_LIBARCHIVE=1)
168
186
  endif()
169
187
 
170
- # Link libraries (Kotlin API from AAR handles STT/TTS; C-API only for Zipvoice)
188
+ # Link libraries (Kotlin API from AAR handles STT/TTS incl. Zipvoice)
171
189
  if(USE_FFMPEG)
172
190
  target_link_directories(sherpaonnx PRIVATE ${FFMPEG_LIB_DIR})
173
191
  endif()
@@ -157,7 +157,7 @@ bool ArchiveHelper::ExtractTarBz2(
157
157
  // If target exists and is a directory, extract into it (merge). Otherwise require empty or force-remove.
158
158
  if (std::filesystem::exists(target_path)) {
159
159
  if (std::filesystem::is_directory(target_path)) {
160
- // Merge: extract into existing directory (e.g. multiple archives same base path)
160
+ // Merge: extract into existing directory (e.g. multiple archives --> same base path)
161
161
  } else if (force) {
162
162
  std::error_code ec;
163
163
  std::filesystem::remove_all(target_path, ec);
@@ -459,7 +459,7 @@ bool ArchiveHelper::ExtractFromStream(
459
459
 
460
460
  if (std::filesystem::exists(target_path)) {
461
461
  if (std::filesystem::is_directory(target_path)) {
462
- // Merge: extract into existing directory (e.g. multiple archives same base path)
462
+ // Merge: extract into existing directory (e.g. multiple archives --> same base path)
463
463
  } else if (force) {
464
464
  std::error_code ec;
465
465
  std::filesystem::remove_all(target_path, ec);
@@ -9,6 +9,7 @@
9
9
  #include <jni.h>
10
10
  #include <string>
11
11
  #include <sys/stat.h>
12
+ #include <vector>
12
13
 
13
14
  #define LOG_TAG "AudioConvertJNI"
14
15
  #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
@@ -24,11 +25,14 @@ extern "C" {
24
25
  #include <libswresample/swresample.h>
25
26
  }
26
27
  #include <cstdio>
27
- #include <vector>
28
28
  #endif
29
29
 
30
30
  // Forward declaration — convertToFormat handles all formats including WAV (16 kHz mono).
31
31
  static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz);
32
+ static std::string decodeAudioFileToFloatMono(const char* inputPath,
33
+ int targetSampleRateHz,
34
+ std::vector<float>* outSamples,
35
+ int* outSampleRate);
32
36
 
33
37
  // Convenience: convert any audio to 16 kHz mono WAV via the main convertToFormat pipeline.
34
38
  static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
@@ -614,7 +618,8 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
614
618
  av_packet_unref(pkt);
615
619
  continue;
616
620
  }
617
- int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
621
+ const uint8_t* const* in_data = frame->extended_data ? frame->extended_data : frame->data;
622
+ int converted = swr_convert(swr, outData, (int)out_nb_samples, in_data, frame->nb_samples);
618
623
  if (converted <= 0) {
619
624
  av_freep(&outData[0]);
620
625
  av_freep(&outData);
@@ -701,6 +706,204 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
701
706
  #endif
702
707
  }
703
708
 
709
+ // Decode any FFmpeg-supported audio to mono float PCM in [-1,1] (clipping not applied) at outSampleRate.
710
+ static std::string decodeAudioFileToFloatMono(const char* inputPath,
711
+ int targetSampleRateHz,
712
+ std::vector<float>* outSamples,
713
+ int* outSampleRate) {
714
+ outSamples->clear();
715
+ *outSampleRate = 0;
716
+ #ifndef HAVE_FFMPEG
717
+ (void)inputPath;
718
+ (void)targetSampleRateHz;
719
+ return std::string("FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.");
720
+ #else
721
+ if (!inputPath) {
722
+ return std::string("inputPath is null");
723
+ }
724
+
725
+ AVFormatContext* inFmt = nullptr;
726
+ if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
727
+ LOGE("decodeAudioFileToFloatMono: failed to open inputPath=%s", inputPath);
728
+ return std::string("Failed to open input file");
729
+ }
730
+ if (avformat_find_stream_info(inFmt, nullptr) < 0) {
731
+ avformat_close_input(&inFmt);
732
+ return std::string("Failed to find stream info");
733
+ }
734
+
735
+ int audioStreamIndex = -1;
736
+ for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
737
+ if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
738
+ audioStreamIndex = (int)i;
739
+ break;
740
+ }
741
+ }
742
+ if (audioStreamIndex < 0) {
743
+ avformat_close_input(&inFmt);
744
+ return std::string("No audio stream found in input");
745
+ }
746
+
747
+ AVStream* inStream = inFmt->streams[audioStreamIndex];
748
+ const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
749
+ if (!decoder) {
750
+ avformat_close_input(&inFmt);
751
+ return std::string("Unsupported input codec");
752
+ }
753
+
754
+ AVCodecContext* decCtx = avcodec_alloc_context3(decoder);
755
+ if (!decCtx) {
756
+ avformat_close_input(&inFmt);
757
+ return std::string("Failed to allocate decoder context");
758
+ }
759
+ if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
760
+ avcodec_free_context(&decCtx);
761
+ avformat_close_input(&inFmt);
762
+ return std::string("Failed to copy codec parameters");
763
+ }
764
+ if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
765
+ avcodec_free_context(&decCtx);
766
+ avformat_close_input(&inFmt);
767
+ return std::string("Failed to open decoder");
768
+ }
769
+
770
+ int in_sr = decCtx->sample_rate;
771
+ if (inStream->codecpar->sample_rate > 0) {
772
+ in_sr = inStream->codecpar->sample_rate;
773
+ }
774
+ if (in_sr <= 0) {
775
+ avcodec_free_context(&decCtx);
776
+ avformat_close_input(&inFmt);
777
+ return std::string("Invalid input sample rate");
778
+ }
779
+
780
+ int out_sr = (targetSampleRateHz > 0) ? targetSampleRateHz : in_sr;
781
+ if (out_sr <= 0) {
782
+ avcodec_free_context(&decCtx);
783
+ avformat_close_input(&inFmt);
784
+ return std::string("Invalid output sample rate");
785
+ }
786
+
787
+ AVChannelLayout in_layout{};
788
+ if (inStream->codecpar->ch_layout.nb_channels > 0) {
789
+ if (av_channel_layout_copy(&in_layout, &inStream->codecpar->ch_layout) < 0) {
790
+ avcodec_free_context(&decCtx);
791
+ avformat_close_input(&inFmt);
792
+ return std::string("Failed to copy input channel layout");
793
+ }
794
+ } else {
795
+ if (av_channel_layout_copy(&in_layout, &decCtx->ch_layout) < 0) {
796
+ avcodec_free_context(&decCtx);
797
+ avformat_close_input(&inFmt);
798
+ return std::string("Failed to get decoder channel layout");
799
+ }
800
+ }
801
+
802
+ AVChannelLayout out_layout = AV_CHANNEL_LAYOUT_MONO;
803
+ SwrContext* swr = nullptr;
804
+ if (swr_alloc_set_opts2(&swr,
805
+ &out_layout,
806
+ AV_SAMPLE_FMT_FLT,
807
+ out_sr,
808
+ &in_layout,
809
+ decCtx->sample_fmt,
810
+ in_sr,
811
+ 0,
812
+ nullptr) < 0 ||
813
+ !swr) {
814
+ av_channel_layout_uninit(&in_layout);
815
+ avcodec_free_context(&decCtx);
816
+ avformat_close_input(&inFmt);
817
+ return std::string("Failed to initialize resampler");
818
+ }
819
+ if (swr_init(swr) < 0) {
820
+ av_channel_layout_uninit(&in_layout);
821
+ swr_free(&swr);
822
+ avcodec_free_context(&decCtx);
823
+ avformat_close_input(&inFmt);
824
+ return std::string("Failed to initialize resampler (swr_init)");
825
+ }
826
+ av_channel_layout_uninit(&in_layout);
827
+
828
+ AVPacket* pkt = av_packet_alloc();
829
+ AVFrame* frame = av_frame_alloc();
830
+ if (!pkt || !frame) {
831
+ if (pkt) av_packet_free(&pkt);
832
+ if (frame) av_frame_free(&frame);
833
+ swr_free(&swr);
834
+ avcodec_free_context(&decCtx);
835
+ avformat_close_input(&inFmt);
836
+ return std::string("Out of memory");
837
+ }
838
+
839
+ auto appendConverted = [&](uint8_t* buf, int nbFloats) {
840
+ if (!buf || nbFloats <= 0) return;
841
+ const float* f = reinterpret_cast<const float*>(buf);
842
+ outSamples->insert(outSamples->end(), f, f + nbFloats);
843
+ };
844
+
845
+ auto convertOneFrame = [&](AVFrame* fr) {
846
+ const uint8_t* const* in_data = fr->extended_data ? fr->extended_data : fr->data;
847
+ int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
848
+ int64_t max_out =
849
+ av_rescale_rnd(swr_get_delay(swr, in_sr2) + (int64_t)fr->nb_samples, out_sr, in_sr2, AV_ROUND_UP);
850
+ if (max_out < 1) max_out = 1;
851
+ uint8_t* out_buf = nullptr;
852
+ if (av_samples_alloc(&out_buf, nullptr, 1, (int)max_out, AV_SAMPLE_FMT_FLT, 0) < 0) {
853
+ return;
854
+ }
855
+ int converted = swr_convert(swr, &out_buf, (int)max_out, in_data, fr->nb_samples);
856
+ if (converted > 0) {
857
+ appendConverted(out_buf, converted);
858
+ }
859
+ av_freep(&out_buf);
860
+ };
861
+
862
+ while (av_read_frame(inFmt, pkt) >= 0) {
863
+ if (pkt->stream_index == audioStreamIndex) {
864
+ if (avcodec_send_packet(decCtx, pkt) == 0) {
865
+ while (avcodec_receive_frame(decCtx, frame) == 0) {
866
+ convertOneFrame(frame);
867
+ av_frame_unref(frame);
868
+ }
869
+ }
870
+ }
871
+ av_packet_unref(pkt);
872
+ }
873
+
874
+ if (avcodec_send_packet(decCtx, nullptr) == 0) {
875
+ while (avcodec_receive_frame(decCtx, frame) == 0) {
876
+ convertOneFrame(frame);
877
+ av_frame_unref(frame);
878
+ }
879
+ }
880
+
881
+ {
882
+ int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
883
+ int tailCap = (int)swr_get_delay(swr, in_sr2) + 4096;
884
+ if (tailCap < 16) tailCap = 16;
885
+ uint8_t* tailData = nullptr;
886
+ if (av_samples_alloc(&tailData, nullptr, 1, tailCap, AV_SAMPLE_FMT_FLT, 0) >= 0) {
887
+ int tailConverted = swr_convert(swr, &tailData, tailCap, nullptr, 0);
888
+ if (tailConverted > 0) {
889
+ appendConverted(tailData, tailConverted);
890
+ }
891
+ av_freep(&tailData);
892
+ }
893
+ }
894
+
895
+ av_packet_free(&pkt);
896
+ av_frame_free(&frame);
897
+ swr_free(&swr);
898
+ avcodec_free_context(&decCtx);
899
+ avformat_close_input(&inFmt);
900
+
901
+ *outSampleRate = out_sr;
902
+ LOGI("decodeAudioFileToFloatMono: samples=%zu sampleRate=%d", outSamples->size(), out_sr);
903
+ return std::string("");
904
+ #endif
905
+ }
906
+
704
907
  extern "C" {
705
908
 
706
909
  // Called from Kotlin: SherpaOnnxModule.nativeConvertAudioToWav16k(inputPath, outputPath) -> Boolean
@@ -759,4 +962,67 @@ Java_com_sherpaonnx_SherpaOnnxModule_nativeConvertAudioToFormat(
759
962
  return env->NewStringUTF(err.c_str());
760
963
  }
761
964
 
965
+ // Returns Object[]: on error [String message]; on success [float[] samples, Integer sampleRate].
966
+ JNIEXPORT jobjectArray JNICALL
967
+ Java_com_sherpaonnx_SherpaOnnxModule_nativeDecodeAudioFileToFloatSamples(JNIEnv* env,
968
+ jobject /* this */,
969
+ jstring inputPath,
970
+ jint targetSampleRateHz) {
971
+ jclass objectClass = env->FindClass("java/lang/Object");
972
+ if (!objectClass) {
973
+ return nullptr;
974
+ }
975
+
976
+ auto makeError = [&](const char* msg) -> jobjectArray {
977
+ jobjectArray ret = env->NewObjectArray(1, objectClass, nullptr);
978
+ if (!ret) return nullptr;
979
+ jstring jmsg = env->NewStringUTF(msg);
980
+ env->SetObjectArrayElement(ret, 0, jmsg);
981
+ env->DeleteLocalRef(jmsg);
982
+ return ret;
983
+ };
984
+
985
+ if (inputPath == nullptr) {
986
+ return makeError("inputPath must be non-null");
987
+ }
988
+ const char* input = env->GetStringUTFChars(inputPath, nullptr);
989
+ if (input == nullptr) {
990
+ return makeError("Failed to get path string");
991
+ }
992
+
993
+ std::vector<float> samples;
994
+ int sampleRate = 0;
995
+ std::string err = decodeAudioFileToFloatMono(input, (int)targetSampleRateHz, &samples, &sampleRate);
996
+ env->ReleaseStringUTFChars(inputPath, input);
997
+
998
+ if (!err.empty()) {
999
+ return makeError(err.c_str());
1000
+ }
1001
+
1002
+ jfloatArray jfloats = env->NewFloatArray((jsize)samples.size());
1003
+ if (!jfloats) {
1004
+ return makeError("Failed to allocate float array");
1005
+ }
1006
+ if (!samples.empty()) {
1007
+ env->SetFloatArrayRegion(jfloats, 0, (jsize)samples.size(), samples.data());
1008
+ }
1009
+
1010
+ jobjectArray ret = env->NewObjectArray(2, objectClass, nullptr);
1011
+ if (!ret) {
1012
+ env->DeleteLocalRef(jfloats);
1013
+ return makeError("Failed to allocate result array");
1014
+ }
1015
+ env->SetObjectArrayElement(ret, 0, jfloats);
1016
+
1017
+ jclass intCls = env->FindClass("java/lang/Integer");
1018
+ jmethodID intCtor = env->GetMethodID(intCls, "<init>", "(I)V");
1019
+ jobject jrate = env->NewObject(intCls, intCtor, sampleRate);
1020
+ env->SetObjectArrayElement(ret, 1, jrate);
1021
+
1022
+ env->DeleteLocalRef(jfloats);
1023
+ env->DeleteLocalRef(jrate);
1024
+ env->DeleteLocalRef(intCls);
1025
+ return ret;
1026
+ }
1027
+
762
1028
  } // extern "C"
@@ -2,7 +2,7 @@
2
2
  * sherpa-onnx-model-detect-tts.cpp
3
3
  *
4
4
  * Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
5
- * nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
5
+ * nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice, Supertonic.
6
6
  *
7
7
  * --- Detection pipeline (overview) ---
8
8
  *
@@ -56,18 +56,20 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
56
56
  if (modelType == "kitten") return TtsModelKind::kKitten;
57
57
  if (modelType == "pocket") return TtsModelKind::kPocket;
58
58
  if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
59
+ if (modelType == "supertonic") return TtsModelKind::kSupertonic;
59
60
  return TtsModelKind::kUnknown;
60
61
  }
61
62
 
62
63
  /** Returns true if the given kind is supported by the current paths and hints (required files present).
63
- * data_dir (espeak-ng-data) is required only for Kitten and Kokoro (sherpa-onnx config Validate());
64
- * VITS, Matcha, Zipvoice use it optionally; Pocket does not use it. */
64
+ * data_dir (espeak-ng-data) is required for Kitten, Kokoro, and Zipvoice (Zipvoice uses MatchaTtsLexicon + espeak).
65
+ * VITS and Matcha use dataDir optionally in this detector; Pocket does not use it. */
65
66
  static bool CapabilitySupportsTtsKind(
66
67
  TtsModelKind kind,
67
68
  bool hasVits,
68
69
  bool hasMatcha,
69
70
  bool hasPocket,
70
71
  bool hasZipvoice,
72
+ bool hasSupertonic,
71
73
  bool hasVoicesFile,
72
74
  bool hasDataDir
73
75
  ) {
@@ -83,6 +85,8 @@ static bool CapabilitySupportsTtsKind(
83
85
  return hasPocket;
84
86
  case TtsModelKind::kZipvoice:
85
87
  return hasZipvoice;
88
+ case TtsModelKind::kSupertonic:
89
+ return hasSupertonic;
86
90
  default:
87
91
  return false;
88
92
  }
@@ -108,6 +112,7 @@ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& model
108
112
  if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
109
113
  if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
110
114
  if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
115
+ if (lower.find("supertonic") != std::string::npos) add(TtsModelKind::kSupertonic);
111
116
  if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
112
117
  if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
113
118
  if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
@@ -128,6 +133,10 @@ static TtsDetectResult DetectTtsModelFromFiles(
128
133
  std::string tokensFile = FindFileByName(files, "tokens.txt");
129
134
  std::vector<LexiconCandidate> lexiconCandidates = FindLexiconCandidates(files, modelDir);
130
135
  std::string dataDirPath = FindDirectoryUnderRoot(files, modelDir, "espeak-ng-data");
136
+ LOGI("DetectTtsModel: modelDir=%s espeak-ng dataDir=%s (empty=%d)",
137
+ modelDir.c_str(),
138
+ dataDirPath.empty() ? "(empty)" : dataDirPath.c_str(),
139
+ (int)dataDirPath.empty());
131
140
  std::string voicesFile = FindFileByName(files, "voices.bin");
132
141
 
133
142
  std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
@@ -139,16 +148,27 @@ static TtsDetectResult DetectTtsModelFromFiles(
139
148
  std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
140
149
  std::string vocabJsonFile = FindFileByName(files, "vocab.json");
141
150
  std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
151
+ std::string durationPredictor = FindOnnxByAnyToken(files, {"duration_predictor", "duration-predictor"}, std::nullopt);
152
+ std::string textEncoderSupertonic = FindOnnxByAnyToken(files, {"text_encoder", "text-encoder"}, std::nullopt);
153
+ std::string vectorEstimator = FindOnnxByAnyToken(files, {"vector_estimator", "vector-estimator"}, std::nullopt);
154
+ std::string ttsJsonFile = FindFileByName(files, "tts.json");
155
+ std::string unicodeIndexerFile = FindFileByName(files, "unicode_indexer.bin");
156
+ std::string voiceStyleFile = FindFileByName(files, "voice.bin");
142
157
 
143
158
  std::vector<std::string> modelExcludes = {
144
- "acoustic", "vocoder", "encoder", "decoder", "joiner"
159
+ "acoustic", "vocoder", "encoder", "decoder", "joiner",
160
+ // Supertonic component models are not VITS monolithic model.onnx files.
161
+ "duration_predictor", "duration-predictor",
162
+ "text_encoder", "text-encoder",
163
+ "vector_estimator", "vector-estimator"
145
164
  };
146
165
  std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
147
166
  if (ttsModel.empty()) {
148
167
  ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
149
168
  }
150
169
 
151
- bool hasVits = !ttsModel.empty();
170
+ // VITS requires both model.onnx-like file and tokens.txt
171
+ bool hasVits = !ttsModel.empty() && !tokensFile.empty();
152
172
  std::string modelDirLower = ToLower(modelDir);
153
173
  bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
154
174
  bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
@@ -165,6 +185,9 @@ static TtsDetectResult DetectTtsModelFromFiles(
165
185
  }
166
186
  bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
167
187
  !textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
188
+ bool hasSupertonic = !durationPredictor.empty() && !textEncoderSupertonic.empty() &&
189
+ !vectorEstimator.empty() && !vocoder.empty() && !ttsJsonFile.empty() &&
190
+ !unicodeIndexerFile.empty() && !voiceStyleFile.empty();
168
191
  bool hasDataDir = !dataDirPath.empty();
169
192
 
170
193
  bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
@@ -173,6 +196,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
173
196
  if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
174
197
  if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
175
198
  if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
199
+ if (hasSupertonic) result.detectedModels.push_back({"supertonic", modelDir});
176
200
  if (hasVoicesFile) {
177
201
  if (isLikelyKitten && !isLikelyKokoro) {
178
202
  result.detectedModels.push_back({"kitten", modelDir});
@@ -201,7 +225,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
201
225
  std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
202
226
  if (!nameCandidates.empty()) {
203
227
  for (TtsModelKind k : nameCandidates) {
204
- if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
228
+ if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice, hasSupertonic,
205
229
  hasVoicesFile, hasDataDir)) {
206
230
  selected = k;
207
231
  break;
@@ -212,6 +236,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
212
236
  if (hasMatcha) selected = TtsModelKind::kMatcha;
213
237
  else if (hasPocket) selected = TtsModelKind::kPocket;
214
238
  else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
239
+ else if (hasSupertonic) selected = TtsModelKind::kSupertonic;
215
240
  else if (hasVoicesFile) {
216
241
  if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
217
242
  else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
@@ -252,6 +277,12 @@ static TtsDetectResult DetectTtsModelFromFiles(
252
277
  result.paths.textConditioner = textConditioner;
253
278
  result.paths.vocabJson = vocabJsonFile;
254
279
  result.paths.tokenScoresJson = tokenScoresJsonFile;
280
+ result.paths.durationPredictor = durationPredictor;
281
+ result.paths.textEncoder = textEncoderSupertonic;
282
+ result.paths.vectorEstimator = vectorEstimator;
283
+ result.paths.ttsJson = ttsJsonFile;
284
+ result.paths.unicodeIndexer = unicodeIndexerFile;
285
+ result.paths.voiceStyle = voiceStyleFile;
255
286
 
256
287
  auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
257
288
  if (!validation.ok) {
@@ -38,7 +38,8 @@ enum class TtsModelKind {
38
38
  kKokoro,
39
39
  kKitten,
40
40
  kPocket,
41
- kZipvoice
41
+ kZipvoice,
42
+ kSupertonic
42
43
  };
43
44
 
44
45
  struct SttModelPaths {
@@ -154,6 +155,13 @@ struct TtsModelPaths {
154
155
  std::string textConditioner;
155
156
  std::string vocabJson;
156
157
  std::string tokenScoresJson;
158
+ // Supertonic TTS
159
+ std::string durationPredictor;
160
+ std::string textEncoder;
161
+ std::string vectorEstimator;
162
+ std::string ttsJson;
163
+ std::string unicodeIndexer;
164
+ std::string voiceStyle;
157
165
  };
158
166
 
159
167
  struct SttDetectResult {
@@ -20,6 +20,7 @@ const char* TtsModelKindToString(TtsModelKind k) {
20
20
  case TtsModelKind::kKitten: return "kitten";
21
21
  case TtsModelKind::kPocket: return "pocket";
22
22
  case TtsModelKind::kZipvoice: return "zipvoice";
23
+ case TtsModelKind::kSupertonic: return "supertonic";
23
24
  default: return "unknown";
24
25
  }
25
26
  }
@@ -78,6 +79,12 @@ jobject TtsDetectResultToJava(JNIEnv* env, const TtsDetectResult& result) {
78
79
  PutString(env, pathsMap, mapPut, "textConditioner", result.paths.textConditioner);
79
80
  PutString(env, pathsMap, mapPut, "vocabJson", result.paths.vocabJson);
80
81
  PutString(env, pathsMap, mapPut, "tokenScoresJson", result.paths.tokenScoresJson);
82
+ PutString(env, pathsMap, mapPut, "durationPredictor", result.paths.durationPredictor);
83
+ PutString(env, pathsMap, mapPut, "textEncoder", result.paths.textEncoder);
84
+ PutString(env, pathsMap, mapPut, "vectorEstimator", result.paths.vectorEstimator);
85
+ PutString(env, pathsMap, mapPut, "ttsJson", result.paths.ttsJson);
86
+ PutString(env, pathsMap, mapPut, "unicodeIndexer", result.paths.unicodeIndexer);
87
+ PutString(env, pathsMap, mapPut, "voiceStyle", result.paths.voiceStyle);
81
88
  jstring keyPaths = env->NewStringUTF("paths");
82
89
  env->CallObjectMethod(map, mapPut, keyPaths, pathsMap);
83
90
  env->DeleteLocalRef(keyPaths);
@@ -55,8 +55,18 @@ static const TtsFieldRequirement kZipvoiceReqs[] = {
55
55
  {"decoder", &TtsModelPaths::decoder, true},
56
56
  {"vocoder", &TtsModelPaths::vocoder, true},
57
57
  {"tokens", &TtsModelPaths::tokens, true},
58
- {"dataDir", &TtsModelPaths::dataDir, false},
59
- {"lexicon", &TtsModelPaths::lexicon, false},
58
+ {"dataDir", &TtsModelPaths::dataDir, true},
59
+ {"lexicon", &TtsModelPaths::lexicon, true},
60
+ };
61
+
62
+ static const TtsFieldRequirement kSupertonicReqs[] = {
63
+ {"durationPredictor", &TtsModelPaths::durationPredictor, true},
64
+ {"textEncoder", &TtsModelPaths::textEncoder, true},
65
+ {"vectorEstimator", &TtsModelPaths::vectorEstimator, true},
66
+ {"vocoder", &TtsModelPaths::vocoder, true},
67
+ {"ttsJson", &TtsModelPaths::ttsJson, true},
68
+ {"unicodeIndexer", &TtsModelPaths::unicodeIndexer, true},
69
+ {"voiceStyle", &TtsModelPaths::voiceStyle, true},
60
70
  };
61
71
 
62
72
  // ============================================================
@@ -79,6 +89,9 @@ static const TtsFieldRequirement* GetRequirements(TtsModelKind kind, size_t& cou
79
89
  case TtsModelKind::kZipvoice:
80
90
  count = std::size(kZipvoiceReqs);
81
91
  return kZipvoiceReqs;
92
+ case TtsModelKind::kSupertonic:
93
+ count = std::size(kSupertonicReqs);
94
+ return kSupertonicReqs;
82
95
  default:
83
96
  count = 0;
84
97
  return nullptr;
@@ -93,6 +106,7 @@ static const char* TtsKindToName(TtsModelKind k) {
93
106
  case TtsModelKind::kKitten: return "Kitten";
94
107
  case TtsModelKind::kPocket: return "Pocket";
95
108
  case TtsModelKind::kZipvoice: return "Zipvoice";
109
+ case TtsModelKind::kSupertonic: return "Supertonic";
96
110
  default: return "Unknown";
97
111
  }
98
112
  }
@@ -102,6 +116,8 @@ static const char* GetFieldHint(const char* fieldName) {
102
116
  return "Copy espeak-ng-data into the model directory.";
103
117
  if (std::strcmp(fieldName, "tokens") == 0)
104
118
  return "Ensure tokens.txt is present in the model directory.";
119
+ if (std::strcmp(fieldName, "lexicon") == 0)
120
+ return "Add lexicon.txt (or lexicon-<lang>.txt) from the official sherpa-onnx Zipvoice/Matcha release; without it the native engine aborts.";
105
121
  return nullptr;
106
122
  }
107
123