react-native-sherpa-onnx 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -0
- package/README.md +92 -21
- package/SherpaOnnx.podspec +3 -0
- package/THIRD_PARTY_LICENSES/README.md +62 -0
- package/THIRD_PARTY_LICENSES/ffmpeg.txt +502 -0
- package/THIRD_PARTY_LICENSES/libarchive.txt +65 -0
- package/THIRD_PARTY_LICENSES/nvidia_omla.txt +181 -0
- package/THIRD_PARTY_LICENSES/onnxruntime.txt +21 -0
- package/THIRD_PARTY_LICENSES/opus.txt +44 -0
- package/THIRD_PARTY_LICENSES/sherpa-onnx.txt +201 -0
- package/THIRD_PARTY_LICENSES/shine.txt +482 -0
- package/THIRD_PARTY_LICENSES/zstd.txt +30 -0
- package/android/build.gradle +7 -3
- package/android/prebuilt-download.gradle +344 -152
- package/android/prebuilt-versions.gradle +1 -1
- package/android/src/main/assets/model_licenses/asr-models-license-status.csv +409 -0
- package/android/src/main/assets/model_licenses/qnn-asr-models-license-status.csv +695 -0
- package/android/src/main/assets/model_licenses/tts-models-license-status.csv +596 -0
- package/android/src/main/cpp/CMakeLists.txt +28 -10
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +2 -2
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +268 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +37 -6
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +9 -1
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +7 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +18 -2
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +40 -10
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +99 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxOnlineSttHelper.kt +4 -1
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +127 -97
- package/ios/Resources/model_licenses/asr-models-license-status.csv +409 -0
- package/ios/Resources/model_licenses/qnn-asr-models-license-status.csv +695 -0
- package/ios/Resources/model_licenses/tts-models-license-status.csv +596 -0
- package/ios/SherpaOnnx+OnlineSTT.mm +2 -0
- package/ios/SherpaOnnx+PcmLiveStream.mm +2 -29
- package/ios/SherpaOnnx+TTS.mm +179 -20
- package/ios/SherpaOnnx.mm +54 -0
- package/ios/SherpaOnnxAudioConvert.h +10 -0
- package/ios/SherpaOnnxAudioConvert.mm +257 -1
- package/ios/archive/sherpa-onnx-archive-helper.h +3 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +39 -6
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +49 -6
- package/ios/model_detect/sherpa-onnx-model-detect.h +9 -1
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +18 -2
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.h +1 -0
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.mm +4 -0
- package/ios/tts/sherpa-onnx-tts-wrapper.h +37 -0
- package/ios/tts/sherpa-onnx-tts-wrapper.mm +158 -3
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +8 -0
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +10 -929
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/download/activeModelOperations.js +26 -0
- package/lib/module/download/activeModelOperations.js.map +1 -0
- package/lib/module/download/background-downloader-types.js +2 -0
- package/lib/module/download/background-downloader-types.js.map +1 -0
- package/lib/module/download/bulkPurge.js +72 -0
- package/lib/module/download/bulkPurge.js.map +1 -0
- package/lib/module/download/checksumPrompt.js +19 -0
- package/lib/module/download/checksumPrompt.js.map +1 -0
- package/lib/module/download/constants.js +7 -0
- package/lib/module/download/constants.js.map +1 -0
- package/lib/module/download/downloadEvents.js +35 -0
- package/lib/module/download/downloadEvents.js.map +1 -0
- package/lib/module/download/downloadTask.js +438 -0
- package/lib/module/download/downloadTask.js.map +1 -0
- package/lib/module/download/ensureModel.js +89 -0
- package/lib/module/download/ensureModel.js.map +1 -0
- package/lib/module/download/index.js +4 -4
- package/lib/module/download/index.js.map +1 -1
- package/lib/module/download/localModels.js +151 -0
- package/lib/module/download/localModels.js.map +1 -0
- package/lib/module/download/modelExtraction.js +174 -0
- package/lib/module/download/modelExtraction.js.map +1 -0
- package/lib/module/download/paths.js +98 -0
- package/lib/module/download/paths.js.map +1 -0
- package/lib/module/download/postDownloadProcessing.js +206 -0
- package/lib/module/download/postDownloadProcessing.js.map +1 -0
- package/lib/module/download/protectedModelKeys.js +31 -0
- package/lib/module/download/protectedModelKeys.js.map +1 -0
- package/lib/module/download/registry.js +268 -0
- package/lib/module/download/registry.js.map +1 -0
- package/lib/module/download/retry.js +59 -0
- package/lib/module/download/retry.js.map +1 -0
- package/lib/module/download/types.js +17 -0
- package/lib/module/download/types.js.map +1 -0
- package/lib/module/download/validation.js +101 -5
- package/lib/module/download/validation.js.map +1 -1
- package/lib/module/{download → extraction}/extractTarBz2.js +3 -1
- package/lib/module/extraction/extractTarBz2.js.map +1 -0
- package/lib/module/{download → extraction}/extractTarZst.js +3 -1
- package/lib/module/extraction/extractTarZst.js.map +1 -0
- package/lib/module/extraction/index.js +3 -4
- package/lib/module/extraction/index.js.map +1 -1
- package/lib/module/index.js +1 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +63 -0
- package/lib/module/licenses.js.map +1 -0
- package/lib/module/stt/index.js +16 -2
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/streaming.js +2 -0
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/stt/streamingTypes.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +21 -3
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/streaming.js +5 -1
- package/lib/module/tts/streaming.js.map +1 -1
- package/lib/module/tts/types.js +4 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/module/utils.js +16 -1
- package/lib/module/utils.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -6
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +10 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +11 -108
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/download/activeModelOperations.d.ts +6 -0
- package/lib/typescript/src/download/activeModelOperations.d.ts.map +1 -0
- package/lib/typescript/src/download/background-downloader-types.d.ts +64 -0
- package/lib/typescript/src/download/background-downloader-types.d.ts.map +1 -0
- package/lib/typescript/src/download/bulkPurge.d.ts +14 -0
- package/lib/typescript/src/download/bulkPurge.d.ts.map +1 -0
- package/lib/typescript/src/download/checksumPrompt.d.ts +3 -0
- package/lib/typescript/src/download/checksumPrompt.d.ts.map +1 -0
- package/lib/typescript/src/download/constants.d.ts +5 -0
- package/lib/typescript/src/download/constants.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadEvents.d.ts +6 -0
- package/lib/typescript/src/download/downloadEvents.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadTask.d.ts +30 -0
- package/lib/typescript/src/download/downloadTask.d.ts.map +1 -0
- package/lib/typescript/src/download/ensureModel.d.ts +26 -0
- package/lib/typescript/src/download/ensureModel.d.ts.map +1 -0
- package/lib/typescript/src/download/index.d.ts +7 -7
- package/lib/typescript/src/download/index.d.ts.map +1 -1
- package/lib/typescript/src/download/localModels.d.ts +15 -0
- package/lib/typescript/src/download/localModels.d.ts.map +1 -0
- package/lib/typescript/src/download/modelExtraction.d.ts +36 -0
- package/lib/typescript/src/download/modelExtraction.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +28 -0
- package/lib/typescript/src/download/paths.d.ts.map +1 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts +19 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -0
- package/lib/typescript/src/download/protectedModelKeys.d.ts +6 -0
- package/lib/typescript/src/download/protectedModelKeys.d.ts.map +1 -0
- package/lib/typescript/src/download/registry.d.ts +14 -0
- package/lib/typescript/src/download/registry.d.ts.map +1 -0
- package/lib/typescript/src/download/retry.d.ts +15 -0
- package/lib/typescript/src/download/retry.d.ts.map +1 -0
- package/lib/typescript/src/download/types.d.ts +96 -0
- package/lib/typescript/src/download/types.d.ts.map +1 -0
- package/lib/typescript/src/download/validation.d.ts +19 -0
- package/lib/typescript/src/download/validation.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -0
- package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -0
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts +10 -0
- package/lib/typescript/src/licenses.d.ts.map +1 -0
- package/lib/typescript/src/stt/index.d.ts +4 -1
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +5 -0
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +3 -1
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +4 -2
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +12 -6
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/lib/typescript/src/utils.d.ts +5 -0
- package/lib/typescript/src/utils.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/{check-model-csvs.sh → ci/check-model-csvs.sh} +9 -2
- package/scripts/ci/collect_all_sherpa_model_streams.sh +101 -0
- package/scripts/ci/collect_one_sherpa_release_stream.sh +189 -0
- package/scripts/ci/sherpa_asr_model_release_streams.json +21 -0
- package/scripts/ci/sherpa_tts_model_release_streams.json +13 -0
- package/scripts/ci/update_model_license_csv.sh +765 -0
- package/scripts/setup-ios-framework.sh +14 -11
- package/scripts/update_commercial_use.js +73 -0
- package/src/NativeSherpaOnnx.ts +37 -6
- package/src/audio/index.ts +20 -0
- package/src/download/ModelDownloadManager.ts +57 -1343
- package/src/download/activeModelOperations.ts +38 -0
- package/src/download/background-downloader-types.ts +73 -0
- package/src/download/bulkPurge.ts +102 -0
- package/src/download/checksumPrompt.ts +25 -0
- package/src/download/constants.ts +5 -0
- package/src/download/downloadEvents.ts +55 -0
- package/src/download/downloadTask.ts +565 -0
- package/src/download/ensureModel.ts +124 -0
- package/src/download/index.ts +21 -4
- package/src/download/localModels.ts +234 -0
- package/src/download/modelExtraction.ts +244 -0
- package/src/download/paths.ts +134 -0
- package/src/download/postDownloadProcessing.ts +292 -0
- package/src/download/protectedModelKeys.ts +30 -0
- package/src/download/registry.ts +405 -0
- package/src/download/retry.ts +76 -0
- package/src/download/types.ts +120 -0
- package/src/download/validation.ts +114 -8
- package/src/{download → extraction}/extractTarBz2.ts +3 -1
- package/src/{download → extraction}/extractTarZst.ts +3 -1
- package/src/extraction/index.ts +3 -7
- package/src/index.tsx +1 -0
- package/src/licenses.ts +100 -0
- package/src/stt/index.ts +20 -2
- package/src/stt/streaming.ts +3 -0
- package/src/stt/streamingTypes.ts +5 -0
- package/src/stt/types.ts +3 -1
- package/src/tts/index.ts +33 -2
- package/src/tts/streaming.ts +12 -0
- package/src/tts/types.ts +15 -5
- package/src/utils.ts +22 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +0 -301
- package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +0 -187
- package/lib/module/download/extractTarBz2.js.map +0 -1
- package/lib/module/download/extractTarZst.js.map +0 -1
- package/lib/typescript/src/download/extractTarBz2.d.ts.map +0 -1
- package/lib/typescript/src/download/extractTarZst.d.ts.map +0 -1
- package/scripts/check-qnn-support.sh +0 -78
- /package/lib/typescript/src/{download → extraction}/extractTarBz2.d.ts +0 -0
- /package/lib/typescript/src/{download → extraction}/extractTarZst.d.ts +0 -0
|
@@ -29,7 +29,7 @@ get_filename_component(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../.." ABSOL
|
|
|
29
29
|
set(USE_FFMPEG ON)
|
|
30
30
|
if(SHERPA_ONNX_DISABLE_FFMPEG)
|
|
31
31
|
set(USE_FFMPEG OFF)
|
|
32
|
-
message(STATUS "FFmpeg disabled (SHERPA_ONNX_DISABLE_FFMPEG=ON). convertAudioToWav16k/convertAudioToFormat will return an error at runtime.")
|
|
32
|
+
message(STATUS "FFmpeg disabled (SHERPA_ONNX_DISABLE_FFMPEG=ON). convertAudioToWav16k/convertAudioToFormat and decode (non-WAV) will return an error at runtime.")
|
|
33
33
|
endif()
|
|
34
34
|
|
|
35
35
|
if(USE_FFMPEG)
|
|
@@ -39,25 +39,32 @@ set(FFMPEG_PREBUILT_BASE "${PROJECT_ROOT}/../third_party/ffmpeg_prebuilt/android
|
|
|
39
39
|
set(FFMPEG_PREBUILT_LIB "${FFMPEG_PREBUILT_BASE}/${ANDROID_ABI}/lib")
|
|
40
40
|
set(FFMPEG_JNILIBS "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}")
|
|
41
41
|
set(FFMPEG_INCLUDE_CPP "${PROJECT_ROOT}/src/main/cpp/include/ffmpeg")
|
|
42
|
-
|
|
42
|
+
# Require a real header — an empty third_party/.../include/ dir must not win over Gradle-populated cpp/include/ffmpeg.
|
|
43
|
+
if(EXISTS "${FFMPEG_PREBUILT_BASE}/include/libavcodec/avcodec.h")
|
|
43
44
|
set(FFMPEG_INCLUDE_DIR "${FFMPEG_PREBUILT_BASE}/include")
|
|
44
45
|
message(STATUS "FFmpeg headers: prebuilts ${FFMPEG_INCLUDE_DIR}")
|
|
45
|
-
elseif(EXISTS "${FFMPEG_INCLUDE_CPP}")
|
|
46
|
+
elseif(EXISTS "${FFMPEG_INCLUDE_CPP}/libavcodec/avcodec.h")
|
|
46
47
|
set(FFMPEG_INCLUDE_DIR "${FFMPEG_INCLUDE_CPP}")
|
|
47
|
-
message(STATUS "FFmpeg headers:
|
|
48
|
+
message(STATUS "FFmpeg headers: module tree ${FFMPEG_INCLUDE_DIR}")
|
|
48
49
|
else()
|
|
49
50
|
message(FATAL_ERROR "FFmpeg headers missing. Either:\n"
|
|
50
51
|
" - Build: cd third_party/ffmpeg_prebuilt && ./build_ffmpeg.sh (creates android/include)\n"
|
|
51
52
|
" - Or use a release that includes include/ (Gradle extracts to ${FFMPEG_INCLUDE_CPP})")
|
|
52
53
|
endif()
|
|
54
|
+
# Prebuilts may be either legacy layout android/<abi>/lib/ (build_ffmpeg.sh) or
|
|
55
|
+
# android/jni/<abi>/ (same as Gradle THIRD_PARTY in docs/PREBUILT_RESOLUTION.md).
|
|
56
|
+
set(FFMPEG_PREBUILT_JNI_ABI "${FFMPEG_PREBUILT_BASE}/jni/${ANDROID_ABI}")
|
|
53
57
|
if(EXISTS "${FFMPEG_PREBUILT_LIB}/libavcodec.so")
|
|
54
58
|
set(FFMPEG_LIB_DIR "${FFMPEG_PREBUILT_LIB}")
|
|
55
59
|
message(STATUS "FFmpeg libs: prebuilts ${FFMPEG_LIB_DIR}")
|
|
60
|
+
elseif(EXISTS "${FFMPEG_PREBUILT_JNI_ABI}/libavcodec.so")
|
|
61
|
+
set(FFMPEG_LIB_DIR "${FFMPEG_PREBUILT_JNI_ABI}")
|
|
62
|
+
message(STATUS "FFmpeg libs: prebuilts (jni/<abi>) ${FFMPEG_LIB_DIR}")
|
|
56
63
|
elseif(EXISTS "${FFMPEG_JNILIBS}/libavcodec.so")
|
|
57
64
|
set(FFMPEG_LIB_DIR "${FFMPEG_JNILIBS}")
|
|
58
65
|
message(STATUS "FFmpeg libs: jniLibs ${FFMPEG_LIB_DIR}")
|
|
59
66
|
else()
|
|
60
|
-
message(FATAL_ERROR "FFmpeg libs missing for ABI ${ANDROID_ABI}. Run
|
|
67
|
+
message(FATAL_ERROR "FFmpeg libs missing for ABI ${ANDROID_ABI}. Run a Gradle Android build so prebuilts populate jniLibs, or add local .so under jniLibs / ffmpeg prebuilts paths.")
|
|
61
68
|
endif()
|
|
62
69
|
endif(USE_FFMPEG)
|
|
63
70
|
|
|
@@ -82,7 +89,6 @@ set(SOURCES
|
|
|
82
89
|
jni/model_detect/sherpa-onnx-stt-wrapper.cpp
|
|
83
90
|
jni/model_detect/sherpa-onnx-tts-wrapper.cpp
|
|
84
91
|
jni/audio/sherpa-onnx-audio-convert-jni.cpp
|
|
85
|
-
jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp
|
|
86
92
|
crypto/sha256.cpp
|
|
87
93
|
)
|
|
88
94
|
|
|
@@ -93,10 +99,19 @@ set(LIBARCHIVE_PREBUILT_LIB "${LIBARCHIVE_PREBUILT_BASE}/${ANDROID_ABI}/lib")
|
|
|
93
99
|
set(LIBARCHIVE_JNILIBS "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}")
|
|
94
100
|
set(LIBARCHIVE_INCLUDE_CPP "${PROJECT_ROOT}/src/main/cpp/include/libarchive")
|
|
95
101
|
set(USE_LIBARCHIVE_PREBUILT OFF)
|
|
102
|
+
set(LIBARCHIVE_PREBUILT_JNI_ABI "${LIBARCHIVE_PREBUILT_BASE}/jni/${ANDROID_ABI}")
|
|
96
103
|
if(EXISTS "${LIBARCHIVE_PREBUILT_LIB}/libarchive.so")
|
|
97
104
|
set(USE_LIBARCHIVE_PREBUILT ON)
|
|
98
105
|
set(LIBARCHIVE_LIB_DIR "${LIBARCHIVE_PREBUILT_LIB}")
|
|
99
|
-
if(EXISTS "${LIBARCHIVE_PREBUILT_BASE}/include")
|
|
106
|
+
if(EXISTS "${LIBARCHIVE_PREBUILT_BASE}/include/archive.h")
|
|
107
|
+
set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_PREBUILT_BASE}/include")
|
|
108
|
+
elseif(EXISTS "${LIBARCHIVE_INCLUDE_CPP}/archive.h")
|
|
109
|
+
set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_INCLUDE_CPP}")
|
|
110
|
+
endif()
|
|
111
|
+
elseif(EXISTS "${LIBARCHIVE_PREBUILT_JNI_ABI}/libarchive.so")
|
|
112
|
+
set(USE_LIBARCHIVE_PREBUILT ON)
|
|
113
|
+
set(LIBARCHIVE_LIB_DIR "${LIBARCHIVE_PREBUILT_JNI_ABI}")
|
|
114
|
+
if(EXISTS "${LIBARCHIVE_PREBUILT_BASE}/include/archive.h")
|
|
100
115
|
set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_PREBUILT_BASE}/include")
|
|
101
116
|
elseif(EXISTS "${LIBARCHIVE_INCLUDE_CPP}/archive.h")
|
|
102
117
|
set(LIBARCHIVE_INCLUDE_DIR "${LIBARCHIVE_INCLUDE_CPP}")
|
|
@@ -128,15 +143,19 @@ add_library(sherpaonnx SHARED
|
|
|
128
143
|
# If we used IMPORTED here, AGP would also copy .so from CMake --> duplicate in mergeNativeLibs.
|
|
129
144
|
set(SHERPA_ONNX_PREBUILT_BASE "${PROJECT_ROOT}/../third_party/sherpa-onnx-prebuilt/android")
|
|
130
145
|
set(SHERPA_ONNX_ABI_LIB "${SHERPA_ONNX_PREBUILT_BASE}/${ANDROID_ABI}/lib")
|
|
146
|
+
set(SHERPA_ONNX_JNI_ABI "${SHERPA_ONNX_PREBUILT_BASE}/jni/${ANDROID_ABI}")
|
|
131
147
|
set(SHERPA_C_API_LIB_DIR "")
|
|
132
148
|
if(EXISTS "${SHERPA_ONNX_ABI_LIB}/libsherpa-onnx-c-api.so")
|
|
133
149
|
set(SHERPA_C_API_LIB_DIR "${SHERPA_ONNX_ABI_LIB}")
|
|
134
150
|
message(STATUS "sherpa-onnx C-API (link only): ${SHERPA_C_API_LIB_DIR}")
|
|
151
|
+
elseif(EXISTS "${SHERPA_ONNX_JNI_ABI}/libsherpa-onnx-c-api.so")
|
|
152
|
+
set(SHERPA_C_API_LIB_DIR "${SHERPA_ONNX_JNI_ABI}")
|
|
153
|
+
message(STATUS "sherpa-onnx C-API (link only, third_party jni/<abi>): ${SHERPA_C_API_LIB_DIR}")
|
|
135
154
|
elseif(EXISTS "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}/libsherpa-onnx-c-api.so")
|
|
136
155
|
set(SHERPA_C_API_LIB_DIR "${PROJECT_ROOT}/src/main/jniLibs/${ANDROID_ABI}")
|
|
137
156
|
message(STATUS "sherpa-onnx C-API (link only, jniLibs): ${SHERPA_C_API_LIB_DIR}")
|
|
138
157
|
else()
|
|
139
|
-
message(WARNING "sherpa-onnx C-API not found.
|
|
158
|
+
message(WARNING "sherpa-onnx C-API not found. Prebuilt sherpa native libs may be incomplete. "
|
|
140
159
|
"Build prebuilts: cd third_party/sherpa-onnx-prebuilt && ./build_sherpa_onnx.sh")
|
|
141
160
|
endif()
|
|
142
161
|
|
|
@@ -148,7 +167,6 @@ target_include_directories(sherpaonnx PRIVATE
|
|
|
148
167
|
${CMAKE_CURRENT_SOURCE_DIR}/jni/archive
|
|
149
168
|
${CMAKE_CURRENT_SOURCE_DIR}/jni/model_detect
|
|
150
169
|
${CMAKE_CURRENT_SOURCE_DIR}/jni/audio
|
|
151
|
-
${CMAKE_CURRENT_SOURCE_DIR}/jni/tts
|
|
152
170
|
${CMAKE_CURRENT_SOURCE_DIR}/include
|
|
153
171
|
)
|
|
154
172
|
if(USE_FFMPEG)
|
|
@@ -167,7 +185,7 @@ if(USE_LIBARCHIVE)
|
|
|
167
185
|
target_compile_definitions(sherpaonnx PRIVATE HAVE_LIBARCHIVE=1)
|
|
168
186
|
endif()
|
|
169
187
|
|
|
170
|
-
# Link libraries (Kotlin API from AAR handles STT/TTS
|
|
188
|
+
# Link libraries (Kotlin API from AAR handles STT/TTS incl. Zipvoice)
|
|
171
189
|
if(USE_FFMPEG)
|
|
172
190
|
target_link_directories(sherpaonnx PRIVATE ${FFMPEG_LIB_DIR})
|
|
173
191
|
endif()
|
|
@@ -157,7 +157,7 @@ bool ArchiveHelper::ExtractTarBz2(
|
|
|
157
157
|
// If target exists and is a directory, extract into it (merge). Otherwise require empty or force-remove.
|
|
158
158
|
if (std::filesystem::exists(target_path)) {
|
|
159
159
|
if (std::filesystem::is_directory(target_path)) {
|
|
160
|
-
// Merge: extract into existing directory (e.g. multiple archives
|
|
160
|
+
// Merge: extract into existing directory (e.g. multiple archives --> same base path)
|
|
161
161
|
} else if (force) {
|
|
162
162
|
std::error_code ec;
|
|
163
163
|
std::filesystem::remove_all(target_path, ec);
|
|
@@ -459,7 +459,7 @@ bool ArchiveHelper::ExtractFromStream(
|
|
|
459
459
|
|
|
460
460
|
if (std::filesystem::exists(target_path)) {
|
|
461
461
|
if (std::filesystem::is_directory(target_path)) {
|
|
462
|
-
// Merge: extract into existing directory (e.g. multiple archives
|
|
462
|
+
// Merge: extract into existing directory (e.g. multiple archives --> same base path)
|
|
463
463
|
} else if (force) {
|
|
464
464
|
std::error_code ec;
|
|
465
465
|
std::filesystem::remove_all(target_path, ec);
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
#include <jni.h>
|
|
10
10
|
#include <string>
|
|
11
11
|
#include <sys/stat.h>
|
|
12
|
+
#include <vector>
|
|
12
13
|
|
|
13
14
|
#define LOG_TAG "AudioConvertJNI"
|
|
14
15
|
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
|
|
@@ -24,11 +25,14 @@ extern "C" {
|
|
|
24
25
|
#include <libswresample/swresample.h>
|
|
25
26
|
}
|
|
26
27
|
#include <cstdio>
|
|
27
|
-
#include <vector>
|
|
28
28
|
#endif
|
|
29
29
|
|
|
30
30
|
// Forward declaration — convertToFormat handles all formats including WAV (16 kHz mono).
|
|
31
31
|
static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz);
|
|
32
|
+
static std::string decodeAudioFileToFloatMono(const char* inputPath,
|
|
33
|
+
int targetSampleRateHz,
|
|
34
|
+
std::vector<float>* outSamples,
|
|
35
|
+
int* outSampleRate);
|
|
32
36
|
|
|
33
37
|
// Convenience: convert any audio to 16 kHz mono WAV via the main convertToFormat pipeline.
|
|
34
38
|
static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
|
|
@@ -614,7 +618,8 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
|
|
|
614
618
|
av_packet_unref(pkt);
|
|
615
619
|
continue;
|
|
616
620
|
}
|
|
617
|
-
|
|
621
|
+
const uint8_t* const* in_data = frame->extended_data ? frame->extended_data : frame->data;
|
|
622
|
+
int converted = swr_convert(swr, outData, (int)out_nb_samples, in_data, frame->nb_samples);
|
|
618
623
|
if (converted <= 0) {
|
|
619
624
|
av_freep(&outData[0]);
|
|
620
625
|
av_freep(&outData);
|
|
@@ -701,6 +706,204 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
|
|
|
701
706
|
#endif
|
|
702
707
|
}
|
|
703
708
|
|
|
709
|
+
// Decode any FFmpeg-supported audio to mono float PCM in [-1,1] (clipping not applied) at outSampleRate.
|
|
710
|
+
static std::string decodeAudioFileToFloatMono(const char* inputPath,
|
|
711
|
+
int targetSampleRateHz,
|
|
712
|
+
std::vector<float>* outSamples,
|
|
713
|
+
int* outSampleRate) {
|
|
714
|
+
outSamples->clear();
|
|
715
|
+
*outSampleRate = 0;
|
|
716
|
+
#ifndef HAVE_FFMPEG
|
|
717
|
+
(void)inputPath;
|
|
718
|
+
(void)targetSampleRateHz;
|
|
719
|
+
return std::string("FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.");
|
|
720
|
+
#else
|
|
721
|
+
if (!inputPath) {
|
|
722
|
+
return std::string("inputPath is null");
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
AVFormatContext* inFmt = nullptr;
|
|
726
|
+
if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
|
|
727
|
+
LOGE("decodeAudioFileToFloatMono: failed to open inputPath=%s", inputPath);
|
|
728
|
+
return std::string("Failed to open input file");
|
|
729
|
+
}
|
|
730
|
+
if (avformat_find_stream_info(inFmt, nullptr) < 0) {
|
|
731
|
+
avformat_close_input(&inFmt);
|
|
732
|
+
return std::string("Failed to find stream info");
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
int audioStreamIndex = -1;
|
|
736
|
+
for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
|
|
737
|
+
if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
|
|
738
|
+
audioStreamIndex = (int)i;
|
|
739
|
+
break;
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
if (audioStreamIndex < 0) {
|
|
743
|
+
avformat_close_input(&inFmt);
|
|
744
|
+
return std::string("No audio stream found in input");
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
AVStream* inStream = inFmt->streams[audioStreamIndex];
|
|
748
|
+
const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
|
|
749
|
+
if (!decoder) {
|
|
750
|
+
avformat_close_input(&inFmt);
|
|
751
|
+
return std::string("Unsupported input codec");
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
AVCodecContext* decCtx = avcodec_alloc_context3(decoder);
|
|
755
|
+
if (!decCtx) {
|
|
756
|
+
avformat_close_input(&inFmt);
|
|
757
|
+
return std::string("Failed to allocate decoder context");
|
|
758
|
+
}
|
|
759
|
+
if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
|
|
760
|
+
avcodec_free_context(&decCtx);
|
|
761
|
+
avformat_close_input(&inFmt);
|
|
762
|
+
return std::string("Failed to copy codec parameters");
|
|
763
|
+
}
|
|
764
|
+
if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
|
|
765
|
+
avcodec_free_context(&decCtx);
|
|
766
|
+
avformat_close_input(&inFmt);
|
|
767
|
+
return std::string("Failed to open decoder");
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
int in_sr = decCtx->sample_rate;
|
|
771
|
+
if (inStream->codecpar->sample_rate > 0) {
|
|
772
|
+
in_sr = inStream->codecpar->sample_rate;
|
|
773
|
+
}
|
|
774
|
+
if (in_sr <= 0) {
|
|
775
|
+
avcodec_free_context(&decCtx);
|
|
776
|
+
avformat_close_input(&inFmt);
|
|
777
|
+
return std::string("Invalid input sample rate");
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
int out_sr = (targetSampleRateHz > 0) ? targetSampleRateHz : in_sr;
|
|
781
|
+
if (out_sr <= 0) {
|
|
782
|
+
avcodec_free_context(&decCtx);
|
|
783
|
+
avformat_close_input(&inFmt);
|
|
784
|
+
return std::string("Invalid output sample rate");
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
AVChannelLayout in_layout{};
|
|
788
|
+
if (inStream->codecpar->ch_layout.nb_channels > 0) {
|
|
789
|
+
if (av_channel_layout_copy(&in_layout, &inStream->codecpar->ch_layout) < 0) {
|
|
790
|
+
avcodec_free_context(&decCtx);
|
|
791
|
+
avformat_close_input(&inFmt);
|
|
792
|
+
return std::string("Failed to copy input channel layout");
|
|
793
|
+
}
|
|
794
|
+
} else {
|
|
795
|
+
if (av_channel_layout_copy(&in_layout, &decCtx->ch_layout) < 0) {
|
|
796
|
+
avcodec_free_context(&decCtx);
|
|
797
|
+
avformat_close_input(&inFmt);
|
|
798
|
+
return std::string("Failed to get decoder channel layout");
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
AVChannelLayout out_layout = AV_CHANNEL_LAYOUT_MONO;
|
|
803
|
+
SwrContext* swr = nullptr;
|
|
804
|
+
if (swr_alloc_set_opts2(&swr,
|
|
805
|
+
&out_layout,
|
|
806
|
+
AV_SAMPLE_FMT_FLT,
|
|
807
|
+
out_sr,
|
|
808
|
+
&in_layout,
|
|
809
|
+
decCtx->sample_fmt,
|
|
810
|
+
in_sr,
|
|
811
|
+
0,
|
|
812
|
+
nullptr) < 0 ||
|
|
813
|
+
!swr) {
|
|
814
|
+
av_channel_layout_uninit(&in_layout);
|
|
815
|
+
avcodec_free_context(&decCtx);
|
|
816
|
+
avformat_close_input(&inFmt);
|
|
817
|
+
return std::string("Failed to initialize resampler");
|
|
818
|
+
}
|
|
819
|
+
if (swr_init(swr) < 0) {
|
|
820
|
+
av_channel_layout_uninit(&in_layout);
|
|
821
|
+
swr_free(&swr);
|
|
822
|
+
avcodec_free_context(&decCtx);
|
|
823
|
+
avformat_close_input(&inFmt);
|
|
824
|
+
return std::string("Failed to initialize resampler (swr_init)");
|
|
825
|
+
}
|
|
826
|
+
av_channel_layout_uninit(&in_layout);
|
|
827
|
+
|
|
828
|
+
AVPacket* pkt = av_packet_alloc();
|
|
829
|
+
AVFrame* frame = av_frame_alloc();
|
|
830
|
+
if (!pkt || !frame) {
|
|
831
|
+
if (pkt) av_packet_free(&pkt);
|
|
832
|
+
if (frame) av_frame_free(&frame);
|
|
833
|
+
swr_free(&swr);
|
|
834
|
+
avcodec_free_context(&decCtx);
|
|
835
|
+
avformat_close_input(&inFmt);
|
|
836
|
+
return std::string("Out of memory");
|
|
837
|
+
}
|
|
838
|
+
|
|
839
|
+
auto appendConverted = [&](uint8_t* buf, int nbFloats) {
|
|
840
|
+
if (!buf || nbFloats <= 0) return;
|
|
841
|
+
const float* f = reinterpret_cast<const float*>(buf);
|
|
842
|
+
outSamples->insert(outSamples->end(), f, f + nbFloats);
|
|
843
|
+
};
|
|
844
|
+
|
|
845
|
+
auto convertOneFrame = [&](AVFrame* fr) {
|
|
846
|
+
const uint8_t* const* in_data = fr->extended_data ? fr->extended_data : fr->data;
|
|
847
|
+
int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
|
|
848
|
+
int64_t max_out =
|
|
849
|
+
av_rescale_rnd(swr_get_delay(swr, in_sr2) + (int64_t)fr->nb_samples, out_sr, in_sr2, AV_ROUND_UP);
|
|
850
|
+
if (max_out < 1) max_out = 1;
|
|
851
|
+
uint8_t* out_buf = nullptr;
|
|
852
|
+
if (av_samples_alloc(&out_buf, nullptr, 1, (int)max_out, AV_SAMPLE_FMT_FLT, 0) < 0) {
|
|
853
|
+
return;
|
|
854
|
+
}
|
|
855
|
+
int converted = swr_convert(swr, &out_buf, (int)max_out, in_data, fr->nb_samples);
|
|
856
|
+
if (converted > 0) {
|
|
857
|
+
appendConverted(out_buf, converted);
|
|
858
|
+
}
|
|
859
|
+
av_freep(&out_buf);
|
|
860
|
+
};
|
|
861
|
+
|
|
862
|
+
while (av_read_frame(inFmt, pkt) >= 0) {
|
|
863
|
+
if (pkt->stream_index == audioStreamIndex) {
|
|
864
|
+
if (avcodec_send_packet(decCtx, pkt) == 0) {
|
|
865
|
+
while (avcodec_receive_frame(decCtx, frame) == 0) {
|
|
866
|
+
convertOneFrame(frame);
|
|
867
|
+
av_frame_unref(frame);
|
|
868
|
+
}
|
|
869
|
+
}
|
|
870
|
+
}
|
|
871
|
+
av_packet_unref(pkt);
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
if (avcodec_send_packet(decCtx, nullptr) == 0) {
|
|
875
|
+
while (avcodec_receive_frame(decCtx, frame) == 0) {
|
|
876
|
+
convertOneFrame(frame);
|
|
877
|
+
av_frame_unref(frame);
|
|
878
|
+
}
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
{
|
|
882
|
+
int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
|
|
883
|
+
int tailCap = (int)swr_get_delay(swr, in_sr2) + 4096;
|
|
884
|
+
if (tailCap < 16) tailCap = 16;
|
|
885
|
+
uint8_t* tailData = nullptr;
|
|
886
|
+
if (av_samples_alloc(&tailData, nullptr, 1, tailCap, AV_SAMPLE_FMT_FLT, 0) >= 0) {
|
|
887
|
+
int tailConverted = swr_convert(swr, &tailData, tailCap, nullptr, 0);
|
|
888
|
+
if (tailConverted > 0) {
|
|
889
|
+
appendConverted(tailData, tailConverted);
|
|
890
|
+
}
|
|
891
|
+
av_freep(&tailData);
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
av_packet_free(&pkt);
|
|
896
|
+
av_frame_free(&frame);
|
|
897
|
+
swr_free(&swr);
|
|
898
|
+
avcodec_free_context(&decCtx);
|
|
899
|
+
avformat_close_input(&inFmt);
|
|
900
|
+
|
|
901
|
+
*outSampleRate = out_sr;
|
|
902
|
+
LOGI("decodeAudioFileToFloatMono: samples=%zu sampleRate=%d", outSamples->size(), out_sr);
|
|
903
|
+
return std::string("");
|
|
904
|
+
#endif
|
|
905
|
+
}
|
|
906
|
+
|
|
704
907
|
extern "C" {
|
|
705
908
|
|
|
706
909
|
// Called from Kotlin: SherpaOnnxModule.nativeConvertAudioToWav16k(inputPath, outputPath) -> Boolean
|
|
@@ -759,4 +962,67 @@ Java_com_sherpaonnx_SherpaOnnxModule_nativeConvertAudioToFormat(
|
|
|
759
962
|
return env->NewStringUTF(err.c_str());
|
|
760
963
|
}
|
|
761
964
|
|
|
965
|
+
// Returns Object[]: on error [String message]; on success [float[] samples, Integer sampleRate].
|
|
966
|
+
JNIEXPORT jobjectArray JNICALL
|
|
967
|
+
Java_com_sherpaonnx_SherpaOnnxModule_nativeDecodeAudioFileToFloatSamples(JNIEnv* env,
|
|
968
|
+
jobject /* this */,
|
|
969
|
+
jstring inputPath,
|
|
970
|
+
jint targetSampleRateHz) {
|
|
971
|
+
jclass objectClass = env->FindClass("java/lang/Object");
|
|
972
|
+
if (!objectClass) {
|
|
973
|
+
return nullptr;
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
auto makeError = [&](const char* msg) -> jobjectArray {
|
|
977
|
+
jobjectArray ret = env->NewObjectArray(1, objectClass, nullptr);
|
|
978
|
+
if (!ret) return nullptr;
|
|
979
|
+
jstring jmsg = env->NewStringUTF(msg);
|
|
980
|
+
env->SetObjectArrayElement(ret, 0, jmsg);
|
|
981
|
+
env->DeleteLocalRef(jmsg);
|
|
982
|
+
return ret;
|
|
983
|
+
};
|
|
984
|
+
|
|
985
|
+
if (inputPath == nullptr) {
|
|
986
|
+
return makeError("inputPath must be non-null");
|
|
987
|
+
}
|
|
988
|
+
const char* input = env->GetStringUTFChars(inputPath, nullptr);
|
|
989
|
+
if (input == nullptr) {
|
|
990
|
+
return makeError("Failed to get path string");
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
std::vector<float> samples;
|
|
994
|
+
int sampleRate = 0;
|
|
995
|
+
std::string err = decodeAudioFileToFloatMono(input, (int)targetSampleRateHz, &samples, &sampleRate);
|
|
996
|
+
env->ReleaseStringUTFChars(inputPath, input);
|
|
997
|
+
|
|
998
|
+
if (!err.empty()) {
|
|
999
|
+
return makeError(err.c_str());
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
jfloatArray jfloats = env->NewFloatArray((jsize)samples.size());
|
|
1003
|
+
if (!jfloats) {
|
|
1004
|
+
return makeError("Failed to allocate float array");
|
|
1005
|
+
}
|
|
1006
|
+
if (!samples.empty()) {
|
|
1007
|
+
env->SetFloatArrayRegion(jfloats, 0, (jsize)samples.size(), samples.data());
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
jobjectArray ret = env->NewObjectArray(2, objectClass, nullptr);
|
|
1011
|
+
if (!ret) {
|
|
1012
|
+
env->DeleteLocalRef(jfloats);
|
|
1013
|
+
return makeError("Failed to allocate result array");
|
|
1014
|
+
}
|
|
1015
|
+
env->SetObjectArrayElement(ret, 0, jfloats);
|
|
1016
|
+
|
|
1017
|
+
jclass intCls = env->FindClass("java/lang/Integer");
|
|
1018
|
+
jmethodID intCtor = env->GetMethodID(intCls, "<init>", "(I)V");
|
|
1019
|
+
jobject jrate = env->NewObject(intCls, intCtor, sampleRate);
|
|
1020
|
+
env->SetObjectArrayElement(ret, 1, jrate);
|
|
1021
|
+
|
|
1022
|
+
env->DeleteLocalRef(jfloats);
|
|
1023
|
+
env->DeleteLocalRef(jrate);
|
|
1024
|
+
env->DeleteLocalRef(intCls);
|
|
1025
|
+
return ret;
|
|
1026
|
+
}
|
|
1027
|
+
|
|
762
1028
|
} // extern "C"
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* sherpa-onnx-model-detect-tts.cpp
|
|
3
3
|
*
|
|
4
4
|
* Purpose: Detects TTS model type and fills TtsModelPaths from a model directory. Used by
|
|
5
|
-
* nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
|
|
5
|
+
* nativeDetectTtsModel (module-jni). Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice, Supertonic.
|
|
6
6
|
*
|
|
7
7
|
* --- Detection pipeline (overview) ---
|
|
8
8
|
*
|
|
@@ -56,18 +56,20 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
|
|
|
56
56
|
if (modelType == "kitten") return TtsModelKind::kKitten;
|
|
57
57
|
if (modelType == "pocket") return TtsModelKind::kPocket;
|
|
58
58
|
if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
|
|
59
|
+
if (modelType == "supertonic") return TtsModelKind::kSupertonic;
|
|
59
60
|
return TtsModelKind::kUnknown;
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
/** Returns true if the given kind is supported by the current paths and hints (required files present).
|
|
63
|
-
* data_dir (espeak-ng-data) is required
|
|
64
|
-
* VITS
|
|
64
|
+
* data_dir (espeak-ng-data) is required for Kitten, Kokoro, and Zipvoice (Zipvoice uses MatchaTtsLexicon + espeak).
|
|
65
|
+
* VITS and Matcha use dataDir optionally in this detector; Pocket does not use it. */
|
|
65
66
|
static bool CapabilitySupportsTtsKind(
|
|
66
67
|
TtsModelKind kind,
|
|
67
68
|
bool hasVits,
|
|
68
69
|
bool hasMatcha,
|
|
69
70
|
bool hasPocket,
|
|
70
71
|
bool hasZipvoice,
|
|
72
|
+
bool hasSupertonic,
|
|
71
73
|
bool hasVoicesFile,
|
|
72
74
|
bool hasDataDir
|
|
73
75
|
) {
|
|
@@ -83,6 +85,8 @@ static bool CapabilitySupportsTtsKind(
|
|
|
83
85
|
return hasPocket;
|
|
84
86
|
case TtsModelKind::kZipvoice:
|
|
85
87
|
return hasZipvoice;
|
|
88
|
+
case TtsModelKind::kSupertonic:
|
|
89
|
+
return hasSupertonic;
|
|
86
90
|
default:
|
|
87
91
|
return false;
|
|
88
92
|
}
|
|
@@ -108,6 +112,7 @@ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& model
|
|
|
108
112
|
if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
|
|
109
113
|
if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
|
|
110
114
|
if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
|
|
115
|
+
if (lower.find("supertonic") != std::string::npos) add(TtsModelKind::kSupertonic);
|
|
111
116
|
if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
|
|
112
117
|
if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
|
|
113
118
|
if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
|
|
@@ -128,6 +133,10 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
128
133
|
std::string tokensFile = FindFileByName(files, "tokens.txt");
|
|
129
134
|
std::vector<LexiconCandidate> lexiconCandidates = FindLexiconCandidates(files, modelDir);
|
|
130
135
|
std::string dataDirPath = FindDirectoryUnderRoot(files, modelDir, "espeak-ng-data");
|
|
136
|
+
LOGI("DetectTtsModel: modelDir=%s espeak-ng dataDir=%s (empty=%d)",
|
|
137
|
+
modelDir.c_str(),
|
|
138
|
+
dataDirPath.empty() ? "(empty)" : dataDirPath.c_str(),
|
|
139
|
+
(int)dataDirPath.empty());
|
|
131
140
|
std::string voicesFile = FindFileByName(files, "voices.bin");
|
|
132
141
|
|
|
133
142
|
std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
|
|
@@ -139,16 +148,27 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
139
148
|
std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
|
|
140
149
|
std::string vocabJsonFile = FindFileByName(files, "vocab.json");
|
|
141
150
|
std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
|
|
151
|
+
std::string durationPredictor = FindOnnxByAnyToken(files, {"duration_predictor", "duration-predictor"}, std::nullopt);
|
|
152
|
+
std::string textEncoderSupertonic = FindOnnxByAnyToken(files, {"text_encoder", "text-encoder"}, std::nullopt);
|
|
153
|
+
std::string vectorEstimator = FindOnnxByAnyToken(files, {"vector_estimator", "vector-estimator"}, std::nullopt);
|
|
154
|
+
std::string ttsJsonFile = FindFileByName(files, "tts.json");
|
|
155
|
+
std::string unicodeIndexerFile = FindFileByName(files, "unicode_indexer.bin");
|
|
156
|
+
std::string voiceStyleFile = FindFileByName(files, "voice.bin");
|
|
142
157
|
|
|
143
158
|
std::vector<std::string> modelExcludes = {
|
|
144
|
-
"acoustic", "vocoder", "encoder", "decoder", "joiner"
|
|
159
|
+
"acoustic", "vocoder", "encoder", "decoder", "joiner",
|
|
160
|
+
// Supertonic component models are not VITS monolithic model.onnx files.
|
|
161
|
+
"duration_predictor", "duration-predictor",
|
|
162
|
+
"text_encoder", "text-encoder",
|
|
163
|
+
"vector_estimator", "vector-estimator"
|
|
145
164
|
};
|
|
146
165
|
std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
|
|
147
166
|
if (ttsModel.empty()) {
|
|
148
167
|
ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
|
|
149
168
|
}
|
|
150
169
|
|
|
151
|
-
|
|
170
|
+
// VITS requires both model.onnx-like file and tokens.txt
|
|
171
|
+
bool hasVits = !ttsModel.empty() && !tokensFile.empty();
|
|
152
172
|
std::string modelDirLower = ToLower(modelDir);
|
|
153
173
|
bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
|
|
154
174
|
bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
|
|
@@ -165,6 +185,9 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
165
185
|
}
|
|
166
186
|
bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
|
|
167
187
|
!textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
|
|
188
|
+
bool hasSupertonic = !durationPredictor.empty() && !textEncoderSupertonic.empty() &&
|
|
189
|
+
!vectorEstimator.empty() && !vocoder.empty() && !ttsJsonFile.empty() &&
|
|
190
|
+
!unicodeIndexerFile.empty() && !voiceStyleFile.empty();
|
|
168
191
|
bool hasDataDir = !dataDirPath.empty();
|
|
169
192
|
|
|
170
193
|
bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
|
|
@@ -173,6 +196,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
173
196
|
if (hasMatcha) result.detectedModels.push_back({"matcha", modelDir});
|
|
174
197
|
if (hasPocket) result.detectedModels.push_back({"pocket", modelDir});
|
|
175
198
|
if (hasZipvoice && !hasMatcha) result.detectedModels.push_back({"zipvoice", modelDir});
|
|
199
|
+
if (hasSupertonic) result.detectedModels.push_back({"supertonic", modelDir});
|
|
176
200
|
if (hasVoicesFile) {
|
|
177
201
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
178
202
|
result.detectedModels.push_back({"kitten", modelDir});
|
|
@@ -201,7 +225,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
201
225
|
std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
|
|
202
226
|
if (!nameCandidates.empty()) {
|
|
203
227
|
for (TtsModelKind k : nameCandidates) {
|
|
204
|
-
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
|
|
228
|
+
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice, hasSupertonic,
|
|
205
229
|
hasVoicesFile, hasDataDir)) {
|
|
206
230
|
selected = k;
|
|
207
231
|
break;
|
|
@@ -212,6 +236,7 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
212
236
|
if (hasMatcha) selected = TtsModelKind::kMatcha;
|
|
213
237
|
else if (hasPocket) selected = TtsModelKind::kPocket;
|
|
214
238
|
else if (hasZipvoice) selected = TtsModelKind::kZipvoice;
|
|
239
|
+
else if (hasSupertonic) selected = TtsModelKind::kSupertonic;
|
|
215
240
|
else if (hasVoicesFile) {
|
|
216
241
|
if (isLikelyKitten && !isLikelyKokoro) selected = TtsModelKind::kKitten;
|
|
217
242
|
else if (isLikelyKokoro && !isLikelyKitten) selected = TtsModelKind::kKokoro;
|
|
@@ -252,6 +277,12 @@ static TtsDetectResult DetectTtsModelFromFiles(
|
|
|
252
277
|
result.paths.textConditioner = textConditioner;
|
|
253
278
|
result.paths.vocabJson = vocabJsonFile;
|
|
254
279
|
result.paths.tokenScoresJson = tokenScoresJsonFile;
|
|
280
|
+
result.paths.durationPredictor = durationPredictor;
|
|
281
|
+
result.paths.textEncoder = textEncoderSupertonic;
|
|
282
|
+
result.paths.vectorEstimator = vectorEstimator;
|
|
283
|
+
result.paths.ttsJson = ttsJsonFile;
|
|
284
|
+
result.paths.unicodeIndexer = unicodeIndexerFile;
|
|
285
|
+
result.paths.voiceStyle = voiceStyleFile;
|
|
255
286
|
|
|
256
287
|
auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
|
|
257
288
|
if (!validation.ok) {
|
|
@@ -38,7 +38,8 @@ enum class TtsModelKind {
|
|
|
38
38
|
kKokoro,
|
|
39
39
|
kKitten,
|
|
40
40
|
kPocket,
|
|
41
|
-
kZipvoice
|
|
41
|
+
kZipvoice,
|
|
42
|
+
kSupertonic
|
|
42
43
|
};
|
|
43
44
|
|
|
44
45
|
struct SttModelPaths {
|
|
@@ -154,6 +155,13 @@ struct TtsModelPaths {
|
|
|
154
155
|
std::string textConditioner;
|
|
155
156
|
std::string vocabJson;
|
|
156
157
|
std::string tokenScoresJson;
|
|
158
|
+
// Supertonic TTS
|
|
159
|
+
std::string durationPredictor;
|
|
160
|
+
std::string textEncoder;
|
|
161
|
+
std::string vectorEstimator;
|
|
162
|
+
std::string ttsJson;
|
|
163
|
+
std::string unicodeIndexer;
|
|
164
|
+
std::string voiceStyle;
|
|
157
165
|
};
|
|
158
166
|
|
|
159
167
|
struct SttDetectResult {
|
|
@@ -20,6 +20,7 @@ const char* TtsModelKindToString(TtsModelKind k) {
|
|
|
20
20
|
case TtsModelKind::kKitten: return "kitten";
|
|
21
21
|
case TtsModelKind::kPocket: return "pocket";
|
|
22
22
|
case TtsModelKind::kZipvoice: return "zipvoice";
|
|
23
|
+
case TtsModelKind::kSupertonic: return "supertonic";
|
|
23
24
|
default: return "unknown";
|
|
24
25
|
}
|
|
25
26
|
}
|
|
@@ -78,6 +79,12 @@ jobject TtsDetectResultToJava(JNIEnv* env, const TtsDetectResult& result) {
|
|
|
78
79
|
PutString(env, pathsMap, mapPut, "textConditioner", result.paths.textConditioner);
|
|
79
80
|
PutString(env, pathsMap, mapPut, "vocabJson", result.paths.vocabJson);
|
|
80
81
|
PutString(env, pathsMap, mapPut, "tokenScoresJson", result.paths.tokenScoresJson);
|
|
82
|
+
PutString(env, pathsMap, mapPut, "durationPredictor", result.paths.durationPredictor);
|
|
83
|
+
PutString(env, pathsMap, mapPut, "textEncoder", result.paths.textEncoder);
|
|
84
|
+
PutString(env, pathsMap, mapPut, "vectorEstimator", result.paths.vectorEstimator);
|
|
85
|
+
PutString(env, pathsMap, mapPut, "ttsJson", result.paths.ttsJson);
|
|
86
|
+
PutString(env, pathsMap, mapPut, "unicodeIndexer", result.paths.unicodeIndexer);
|
|
87
|
+
PutString(env, pathsMap, mapPut, "voiceStyle", result.paths.voiceStyle);
|
|
81
88
|
jstring keyPaths = env->NewStringUTF("paths");
|
|
82
89
|
env->CallObjectMethod(map, mapPut, keyPaths, pathsMap);
|
|
83
90
|
env->DeleteLocalRef(keyPaths);
|
|
@@ -55,8 +55,18 @@ static const TtsFieldRequirement kZipvoiceReqs[] = {
|
|
|
55
55
|
{"decoder", &TtsModelPaths::decoder, true},
|
|
56
56
|
{"vocoder", &TtsModelPaths::vocoder, true},
|
|
57
57
|
{"tokens", &TtsModelPaths::tokens, true},
|
|
58
|
-
{"dataDir", &TtsModelPaths::dataDir,
|
|
59
|
-
{"lexicon", &TtsModelPaths::lexicon,
|
|
58
|
+
{"dataDir", &TtsModelPaths::dataDir, true},
|
|
59
|
+
{"lexicon", &TtsModelPaths::lexicon, true},
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
// Path fields checked for a Supertonic TTS model; GetRequirements hands this table back for
// TtsModelKind::kSupertonic. Each entry pairs a field name with the matching TtsModelPaths member.
// NOTE(review): the trailing `true` presumably marks the field as required - confirm against the
// TtsFieldRequirement declaration.
static const TtsFieldRequirement kSupertonicReqs[] = {
|
|
63
|
+
{"durationPredictor", &TtsModelPaths::durationPredictor, true},
|
|
64
|
+
{"textEncoder", &TtsModelPaths::textEncoder, true},
|
|
65
|
+
{"vectorEstimator", &TtsModelPaths::vectorEstimator, true},
|
|
66
|
+
{"vocoder", &TtsModelPaths::vocoder, true},
|
|
67
|
+
{"ttsJson", &TtsModelPaths::ttsJson, true},
|
|
68
|
+
{"unicodeIndexer", &TtsModelPaths::unicodeIndexer, true},
|
|
69
|
+
{"voiceStyle", &TtsModelPaths::voiceStyle, true},
|
|
60
70
|
};
|
|
61
71
|
|
|
62
72
|
// ============================================================
|
|
@@ -79,6 +89,9 @@ static const TtsFieldRequirement* GetRequirements(TtsModelKind kind, size_t& cou
|
|
|
79
89
|
case TtsModelKind::kZipvoice:
|
|
80
90
|
count = std::size(kZipvoiceReqs);
|
|
81
91
|
return kZipvoiceReqs;
|
|
92
|
+
case TtsModelKind::kSupertonic:
|
|
93
|
+
count = std::size(kSupertonicReqs);
|
|
94
|
+
return kSupertonicReqs;
|
|
82
95
|
default:
|
|
83
96
|
count = 0;
|
|
84
97
|
return nullptr;
|
|
@@ -93,6 +106,7 @@ static const char* TtsKindToName(TtsModelKind k) {
|
|
|
93
106
|
case TtsModelKind::kKitten: return "Kitten";
|
|
94
107
|
case TtsModelKind::kPocket: return "Pocket";
|
|
95
108
|
case TtsModelKind::kZipvoice: return "Zipvoice";
|
|
109
|
+
case TtsModelKind::kSupertonic: return "Supertonic";
|
|
96
110
|
default: return "Unknown";
|
|
97
111
|
}
|
|
98
112
|
}
|
|
@@ -102,6 +116,8 @@ static const char* GetFieldHint(const char* fieldName) {
|
|
|
102
116
|
return "Copy espeak-ng-data into the model directory.";
|
|
103
117
|
if (std::strcmp(fieldName, "tokens") == 0)
|
|
104
118
|
return "Ensure tokens.txt is present in the model directory.";
|
|
119
|
+
if (std::strcmp(fieldName, "lexicon") == 0)
|
|
120
|
+
return "Add lexicon.txt (or lexicon-<lang>.txt) from the official sherpa-onnx Zipvoice/Matcha release; without it the native engine aborts.";
|
|
105
121
|
return nullptr;
|
|
106
122
|
}
|
|
107
123
|
|