react-native-sherpa-onnx 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -0
- package/README.md +92 -21
- package/SherpaOnnx.podspec +3 -0
- package/THIRD_PARTY_LICENSES/README.md +62 -0
- package/THIRD_PARTY_LICENSES/ffmpeg.txt +502 -0
- package/THIRD_PARTY_LICENSES/libarchive.txt +65 -0
- package/THIRD_PARTY_LICENSES/nvidia_omla.txt +181 -0
- package/THIRD_PARTY_LICENSES/onnxruntime.txt +21 -0
- package/THIRD_PARTY_LICENSES/opus.txt +44 -0
- package/THIRD_PARTY_LICENSES/sherpa-onnx.txt +201 -0
- package/THIRD_PARTY_LICENSES/shine.txt +482 -0
- package/THIRD_PARTY_LICENSES/zstd.txt +30 -0
- package/android/build.gradle +7 -3
- package/android/prebuilt-download.gradle +344 -152
- package/android/prebuilt-versions.gradle +1 -1
- package/android/src/main/assets/model_licenses/asr-models-license-status.csv +409 -0
- package/android/src/main/assets/model_licenses/qnn-asr-models-license-status.csv +695 -0
- package/android/src/main/assets/model_licenses/tts-models-license-status.csv +596 -0
- package/android/src/main/cpp/CMakeLists.txt +28 -10
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +2 -2
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +268 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +37 -6
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +9 -1
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +7 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +18 -2
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +40 -10
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +99 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxOnlineSttHelper.kt +4 -1
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +127 -97
- package/ios/Resources/model_licenses/asr-models-license-status.csv +409 -0
- package/ios/Resources/model_licenses/qnn-asr-models-license-status.csv +695 -0
- package/ios/Resources/model_licenses/tts-models-license-status.csv +596 -0
- package/ios/SherpaOnnx+OnlineSTT.mm +2 -0
- package/ios/SherpaOnnx+PcmLiveStream.mm +2 -29
- package/ios/SherpaOnnx+TTS.mm +179 -20
- package/ios/SherpaOnnx.mm +54 -0
- package/ios/SherpaOnnxAudioConvert.h +10 -0
- package/ios/SherpaOnnxAudioConvert.mm +257 -1
- package/ios/archive/sherpa-onnx-archive-helper.h +3 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +39 -6
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +49 -6
- package/ios/model_detect/sherpa-onnx-model-detect.h +9 -1
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +18 -2
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.h +1 -0
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.mm +4 -0
- package/ios/tts/sherpa-onnx-tts-wrapper.h +37 -0
- package/ios/tts/sherpa-onnx-tts-wrapper.mm +158 -3
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +8 -0
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +10 -929
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/download/activeModelOperations.js +26 -0
- package/lib/module/download/activeModelOperations.js.map +1 -0
- package/lib/module/download/background-downloader-types.js +2 -0
- package/lib/module/download/background-downloader-types.js.map +1 -0
- package/lib/module/download/bulkPurge.js +72 -0
- package/lib/module/download/bulkPurge.js.map +1 -0
- package/lib/module/download/checksumPrompt.js +19 -0
- package/lib/module/download/checksumPrompt.js.map +1 -0
- package/lib/module/download/constants.js +7 -0
- package/lib/module/download/constants.js.map +1 -0
- package/lib/module/download/downloadEvents.js +35 -0
- package/lib/module/download/downloadEvents.js.map +1 -0
- package/lib/module/download/downloadTask.js +438 -0
- package/lib/module/download/downloadTask.js.map +1 -0
- package/lib/module/download/ensureModel.js +89 -0
- package/lib/module/download/ensureModel.js.map +1 -0
- package/lib/module/download/index.js +4 -4
- package/lib/module/download/index.js.map +1 -1
- package/lib/module/download/localModels.js +151 -0
- package/lib/module/download/localModels.js.map +1 -0
- package/lib/module/download/modelExtraction.js +174 -0
- package/lib/module/download/modelExtraction.js.map +1 -0
- package/lib/module/download/paths.js +98 -0
- package/lib/module/download/paths.js.map +1 -0
- package/lib/module/download/postDownloadProcessing.js +206 -0
- package/lib/module/download/postDownloadProcessing.js.map +1 -0
- package/lib/module/download/protectedModelKeys.js +31 -0
- package/lib/module/download/protectedModelKeys.js.map +1 -0
- package/lib/module/download/registry.js +268 -0
- package/lib/module/download/registry.js.map +1 -0
- package/lib/module/download/retry.js +59 -0
- package/lib/module/download/retry.js.map +1 -0
- package/lib/module/download/types.js +17 -0
- package/lib/module/download/types.js.map +1 -0
- package/lib/module/download/validation.js +101 -5
- package/lib/module/download/validation.js.map +1 -1
- package/lib/module/{download → extraction}/extractTarBz2.js +3 -1
- package/lib/module/extraction/extractTarBz2.js.map +1 -0
- package/lib/module/{download → extraction}/extractTarZst.js +3 -1
- package/lib/module/extraction/extractTarZst.js.map +1 -0
- package/lib/module/extraction/index.js +3 -4
- package/lib/module/extraction/index.js.map +1 -1
- package/lib/module/index.js +1 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +63 -0
- package/lib/module/licenses.js.map +1 -0
- package/lib/module/stt/index.js +16 -2
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/streaming.js +2 -0
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/stt/streamingTypes.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +21 -3
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/streaming.js +5 -1
- package/lib/module/tts/streaming.js.map +1 -1
- package/lib/module/tts/types.js +4 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/module/utils.js +16 -1
- package/lib/module/utils.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -6
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +10 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +11 -108
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/download/activeModelOperations.d.ts +6 -0
- package/lib/typescript/src/download/activeModelOperations.d.ts.map +1 -0
- package/lib/typescript/src/download/background-downloader-types.d.ts +64 -0
- package/lib/typescript/src/download/background-downloader-types.d.ts.map +1 -0
- package/lib/typescript/src/download/bulkPurge.d.ts +14 -0
- package/lib/typescript/src/download/bulkPurge.d.ts.map +1 -0
- package/lib/typescript/src/download/checksumPrompt.d.ts +3 -0
- package/lib/typescript/src/download/checksumPrompt.d.ts.map +1 -0
- package/lib/typescript/src/download/constants.d.ts +5 -0
- package/lib/typescript/src/download/constants.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadEvents.d.ts +6 -0
- package/lib/typescript/src/download/downloadEvents.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadTask.d.ts +30 -0
- package/lib/typescript/src/download/downloadTask.d.ts.map +1 -0
- package/lib/typescript/src/download/ensureModel.d.ts +26 -0
- package/lib/typescript/src/download/ensureModel.d.ts.map +1 -0
- package/lib/typescript/src/download/index.d.ts +7 -7
- package/lib/typescript/src/download/index.d.ts.map +1 -1
- package/lib/typescript/src/download/localModels.d.ts +15 -0
- package/lib/typescript/src/download/localModels.d.ts.map +1 -0
- package/lib/typescript/src/download/modelExtraction.d.ts +36 -0
- package/lib/typescript/src/download/modelExtraction.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +28 -0
- package/lib/typescript/src/download/paths.d.ts.map +1 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts +19 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -0
- package/lib/typescript/src/download/protectedModelKeys.d.ts +6 -0
- package/lib/typescript/src/download/protectedModelKeys.d.ts.map +1 -0
- package/lib/typescript/src/download/registry.d.ts +14 -0
- package/lib/typescript/src/download/registry.d.ts.map +1 -0
- package/lib/typescript/src/download/retry.d.ts +15 -0
- package/lib/typescript/src/download/retry.d.ts.map +1 -0
- package/lib/typescript/src/download/types.d.ts +96 -0
- package/lib/typescript/src/download/types.d.ts.map +1 -0
- package/lib/typescript/src/download/validation.d.ts +19 -0
- package/lib/typescript/src/download/validation.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -0
- package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -0
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts +10 -0
- package/lib/typescript/src/licenses.d.ts.map +1 -0
- package/lib/typescript/src/stt/index.d.ts +4 -1
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +5 -0
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +3 -1
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +4 -2
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +12 -6
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/lib/typescript/src/utils.d.ts +5 -0
- package/lib/typescript/src/utils.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/{check-model-csvs.sh → ci/check-model-csvs.sh} +9 -2
- package/scripts/ci/collect_all_sherpa_model_streams.sh +101 -0
- package/scripts/ci/collect_one_sherpa_release_stream.sh +189 -0
- package/scripts/ci/sherpa_asr_model_release_streams.json +21 -0
- package/scripts/ci/sherpa_tts_model_release_streams.json +13 -0
- package/scripts/ci/update_model_license_csv.sh +765 -0
- package/scripts/setup-ios-framework.sh +14 -11
- package/scripts/update_commercial_use.js +73 -0
- package/src/NativeSherpaOnnx.ts +37 -6
- package/src/audio/index.ts +20 -0
- package/src/download/ModelDownloadManager.ts +57 -1343
- package/src/download/activeModelOperations.ts +38 -0
- package/src/download/background-downloader-types.ts +73 -0
- package/src/download/bulkPurge.ts +102 -0
- package/src/download/checksumPrompt.ts +25 -0
- package/src/download/constants.ts +5 -0
- package/src/download/downloadEvents.ts +55 -0
- package/src/download/downloadTask.ts +565 -0
- package/src/download/ensureModel.ts +124 -0
- package/src/download/index.ts +21 -4
- package/src/download/localModels.ts +234 -0
- package/src/download/modelExtraction.ts +244 -0
- package/src/download/paths.ts +134 -0
- package/src/download/postDownloadProcessing.ts +292 -0
- package/src/download/protectedModelKeys.ts +30 -0
- package/src/download/registry.ts +405 -0
- package/src/download/retry.ts +76 -0
- package/src/download/types.ts +120 -0
- package/src/download/validation.ts +114 -8
- package/src/{download → extraction}/extractTarBz2.ts +3 -1
- package/src/{download → extraction}/extractTarZst.ts +3 -1
- package/src/extraction/index.ts +3 -7
- package/src/index.tsx +1 -0
- package/src/licenses.ts +100 -0
- package/src/stt/index.ts +20 -2
- package/src/stt/streaming.ts +3 -0
- package/src/stt/streamingTypes.ts +5 -0
- package/src/stt/types.ts +3 -1
- package/src/tts/index.ts +33 -2
- package/src/tts/streaming.ts +12 -0
- package/src/tts/types.ts +15 -5
- package/src/utils.ts +22 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +0 -301
- package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +0 -187
- package/lib/module/download/extractTarBz2.js.map +0 -1
- package/lib/module/download/extractTarZst.js.map +0 -1
- package/lib/typescript/src/download/extractTarBz2.d.ts.map +0 -1
- package/lib/typescript/src/download/extractTarZst.d.ts.map +0 -1
- package/scripts/check-qnn-support.sh +0 -78
- /package/lib/typescript/src/{download → extraction}/extractTarBz2.d.ts +0 -0
- /package/lib/typescript/src/{download → extraction}/extractTarZst.d.ts +0 -0
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#import <React/RCTLog.h>
|
|
3
3
|
#include <string>
|
|
4
4
|
#include <sys/stat.h>
|
|
5
|
+
#include <vector>
|
|
5
6
|
|
|
6
7
|
#ifdef HAVE_FFMPEG
|
|
7
8
|
extern "C" {
|
|
@@ -12,11 +13,14 @@ extern "C" {
|
|
|
12
13
|
#include <libswresample/swresample.h>
|
|
13
14
|
}
|
|
14
15
|
#include <cstdio>
|
|
15
|
-
#include <vector>
|
|
16
16
|
#endif
|
|
17
17
|
|
|
18
18
|
// Forward declaration — convertToFormat handles all formats including WAV (16 kHz mono).
|
|
19
19
|
static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz);
|
|
20
|
+
static std::string decodeAudioFileToFloatMono(const char* inputPath,
|
|
21
|
+
int targetSampleRateHz,
|
|
22
|
+
std::vector<float>* outSamples,
|
|
23
|
+
int* outSampleRate);
|
|
20
24
|
|
|
21
25
|
// Convenience: convert any audio to 16 kHz mono WAV via the main convertToFormat pipeline.
|
|
22
26
|
static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
|
|
@@ -659,6 +663,222 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
|
|
|
659
663
|
#endif
|
|
660
664
|
}
|
|
661
665
|
|
|
666
|
+
/**
 * Decodes the first audio stream of a media file into mono 32-bit float PCM.
 *
 * The stream is decoded with FFmpeg and passed through libswresample, which is configured
 * for a mono output layout, AV_SAMPLE_FMT_FLT samples and the requested output rate.
 *
 * @param inputPath          Path of the file to decode; a null pointer is reported as an error.
 * @param targetSampleRateHz Desired output rate in Hz; values <= 0 keep the input stream's rate.
 * @param outSamples         Receives the decoded samples. Cleared on entry and left empty on
 *                           failure. Must not be null.
 * @param outSampleRate      Receives the output rate in Hz on success and 0 on failure.
 *                           Must not be null.
 * @return Empty string on success, otherwise a human-readable error message.
 */
static std::string decodeAudioFileToFloatMono(const char* inputPath,
                                              int targetSampleRateHz,
                                              std::vector<float>* outSamples,
                                              int* outSampleRate) {
    outSamples->clear();
    *outSampleRate = 0;
#ifndef HAVE_FFMPEG
    (void)inputPath;
    (void)targetSampleRateHz;
    return std::string("FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg_ios.sh.");
#else
    if (!inputPath) {
        return std::string("inputPath is null");
    }

    AVFormatContext* inFmt = nullptr;
    AVCodecContext* decCtx = nullptr;
    SwrContext* swr = nullptr;
    AVPacket* pkt = nullptr;
    AVFrame* frame = nullptr;

    // Releases whatever has been acquired so far. Shared by the success path and every
    // error path so that no exit can forget a handle.
    auto releaseAll = [&]() {
        if (pkt) av_packet_free(&pkt);
        if (frame) av_frame_free(&frame);
        if (swr) swr_free(&swr);
        if (decCtx) avcodec_free_context(&decCtx);
        if (inFmt) avformat_close_input(&inFmt);
    };
    // Error exit: free resources, drop any partially decoded audio, return the message.
    auto fail = [&](const char* message) {
        releaseAll();
        outSamples->clear();
        return std::string(message);
    };

    if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
        return fail("Failed to open input file");
    }
    if (avformat_find_stream_info(inFmt, nullptr) < 0) {
        return fail("Failed to find stream info");
    }

    // Pick the first audio stream in the container.
    int audioStreamIndex = -1;
    for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
        if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            audioStreamIndex = static_cast<int>(i);
            break;
        }
    }
    if (audioStreamIndex < 0) {
        return fail("No audio stream found in input");
    }

    AVStream* inStream = inFmt->streams[audioStreamIndex];
    const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
    if (!decoder) {
        return fail("Unsupported input codec");
    }

    decCtx = avcodec_alloc_context3(decoder);
    if (!decCtx) {
        return fail("Failed to allocate decoder context");
    }
    if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
        return fail("Failed to copy codec parameters");
    }
    if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
        return fail("Failed to open decoder");
    }

    // Prefer the container-level sample rate, falling back to the decoder's. This single
    // value is used both to configure the resampler and for all later delay arithmetic.
    const int in_sr = (inStream->codecpar->sample_rate > 0) ? inStream->codecpar->sample_rate
                                                            : decCtx->sample_rate;
    if (in_sr <= 0) {
        return fail("Invalid input sample rate");
    }
    // in_sr > 0 here, so out_sr is always positive.
    const int out_sr = (targetSampleRateHz > 0) ? targetSampleRateHz : in_sr;

    // Input channel layout: container parameters when they carry a channel count, otherwise
    // whatever the opened decoder reports.
    AVChannelLayout in_layout{};
    if (inStream->codecpar->ch_layout.nb_channels > 0) {
        if (av_channel_layout_copy(&in_layout, &inStream->codecpar->ch_layout) < 0) {
            return fail("Failed to copy input channel layout");
        }
    } else {
        if (av_channel_layout_copy(&in_layout, &decCtx->ch_layout) < 0) {
            return fail("Failed to get decoder channel layout");
        }
    }

    AVChannelLayout out_layout = AV_CHANNEL_LAYOUT_MONO;
    const int swrAllocResult = swr_alloc_set_opts2(&swr,
                                                   &out_layout,
                                                   AV_SAMPLE_FMT_FLT,
                                                   out_sr,
                                                   &in_layout,
                                                   decCtx->sample_fmt,
                                                   in_sr,
                                                   0,
                                                   nullptr);
    if (swrAllocResult < 0 || !swr) {
        av_channel_layout_uninit(&in_layout);
        return fail("Failed to initialize resampler");
    }
    if (swr_init(swr) < 0) {
        av_channel_layout_uninit(&in_layout);
        return fail("Failed to initialize resampler (swr_init)");
    }
    av_channel_layout_uninit(&in_layout);

    pkt = av_packet_alloc();
    frame = av_frame_alloc();
    if (!pkt || !frame) {
        return fail("Out of memory");
    }

    // First conversion problem encountered, if any; reported instead of returning
    // silently truncated audio.
    const char* convertError = nullptr;

    auto appendConverted = [&](const uint8_t* buf, int nbFloats) {
        if (!buf || nbFloats <= 0) return;
        const float* f = reinterpret_cast<const float*>(buf);
        outSamples->insert(outSamples->end(), f, f + nbFloats);
    };

    // Resamples one decoded frame and appends the result. Returns false, with the reason
    // stored in convertError, when the frame could not be converted.
    auto convertOneFrame = [&](AVFrame* fr) -> bool {
        // Copy plane pointers so we can pass const uint8_t** to swr_convert without
        // reinterpret_cast(uint8_t** -> const uint8_t**), which triggers -Wcast-qual.
        uint8_t** src = fr->extended_data ? fr->extended_data : fr->data;

        // Packed audio keeps every channel interleaved in plane 0; only planar audio has
        // one plane per channel. Counting channels for packed data would read past the
        // fixed-size data[] array once there are more than AV_NUM_DATA_POINTERS channels.
        int nplanes = 1;
        if (av_sample_fmt_is_planar(static_cast<AVSampleFormat>(fr->format))) {
            nplanes = fr->ch_layout.nb_channels;
            if (nplanes <= 0) {
                // Channel count unknown: pass along the fixed-size data[] array as-is.
                src = fr->data;
                nplanes = AV_NUM_DATA_POINTERS;
            }
        }

        const uint8_t* in_stack[AV_NUM_DATA_POINTERS] = {};
        std::vector<const uint8_t*> in_heap;
        const uint8_t** in_arg = in_stack;
        if (nplanes > AV_NUM_DATA_POINTERS) {
            in_heap.assign(src, src + nplanes);
            in_arg = in_heap.data();
        } else {
            for (int i = 0; i < nplanes; ++i) {
                in_stack[i] = src[i];
            }
        }

        // Upper bound on output samples: samples still buffered in the resampler plus this
        // frame, rescaled from the input rate to the output rate.
        int64_t max_out = av_rescale_rnd(swr_get_delay(swr, in_sr) + static_cast<int64_t>(fr->nb_samples),
                                         out_sr, in_sr, AV_ROUND_UP);
        if (max_out < 1) max_out = 1;

        uint8_t* out_buf = nullptr;
        if (av_samples_alloc(&out_buf, nullptr, 1, static_cast<int>(max_out), AV_SAMPLE_FMT_FLT, 0) < 0) {
            convertError = "Out of memory";
            return false;
        }
        const int converted = swr_convert(swr, &out_buf, static_cast<int>(max_out), in_arg, fr->nb_samples);
        if (converted > 0) {
            appendConverted(out_buf, converted);
        }
        av_freep(&out_buf);
        if (converted < 0) {
            convertError = "Failed to resample audio";
            return false;
        }
        return true;
    };

    // Pulls every frame currently available from the decoder through the resampler.
    auto drainDecoder = [&]() -> bool {
        while (avcodec_receive_frame(decCtx, frame) == 0) {
            const bool frameOk = convertOneFrame(frame);
            av_frame_unref(frame);
            if (!frameOk) return false;
        }
        return true;
    };

    bool ok = true;
    while (ok && av_read_frame(inFmt, pkt) >= 0) {
        if (pkt->stream_index == audioStreamIndex && avcodec_send_packet(decCtx, pkt) == 0) {
            ok = drainDecoder();
        }
        av_packet_unref(pkt);
    }

    // A null packet puts the decoder into draining mode so it emits its buffered frames.
    if (ok && avcodec_send_packet(decCtx, nullptr) == 0) {
        ok = drainDecoder();
    }

    // Flush the samples still held inside the resampler. A single call with a fixed-size
    // buffer can leave data behind, so keep pulling until nothing more is produced.
    if (ok) {
        const int kTailChunk = 4096;
        uint8_t* tailData = nullptr;
        if (av_samples_alloc(&tailData, nullptr, 1, kTailChunk, AV_SAMPLE_FMT_FLT, 0) < 0) {
            convertError = "Out of memory";
            ok = false;
        } else {
            for (;;) {
                const int tailConverted = swr_convert(swr, &tailData, kTailChunk, nullptr, 0);
                if (tailConverted < 0) {
                    convertError = "Failed to resample audio";
                    ok = false;
                    break;
                }
                if (tailConverted == 0) break;
                appendConverted(tailData, tailConverted);
            }
            av_freep(&tailData);
        }
    }

    if (!ok) {
        return fail(convertError ? convertError : "Failed to decode audio");
    }

    releaseAll();
    *outSampleRate = out_sr;
    return std::string("");
#endif
}
|
|
881
|
+
|
|
662
882
|
@implementation SherpaOnnxAudioConvert
|
|
663
883
|
|
|
664
884
|
+ (BOOL)convertAudioToWav16k:(NSString *)inputPath
|
|
@@ -695,4 +915,40 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
|
|
|
695
915
|
return YES;
|
|
696
916
|
}
|
|
697
917
|
|
|
918
|
+
/**
 * Decodes an audio file via decodeAudioFileToFloatMono and hands the samples back as
 * boxed NSNumber floats.
 *
 * @param inputPath          Path of the audio file to decode.
 * @param targetSampleRateHz Sample rate forwarded unchanged to decodeAudioFileToFloatMono.
 * @param outSamples         Required. Set to nil on entry and to the decoded samples on success.
 * @param outSampleRate      Required. Set to 0 on entry and to the reported sample rate on success.
 * @param error              Optional. Receives an NSError in domain "SherpaOnnxAudioConvert":
 *                           code -2 when an output pointer is missing, code -1 when decoding fails.
 * @return YES on success, NO otherwise.
 */
+ (BOOL)decodeAudioFileToFloatSamples:(NSString *)inputPath
                   targetSampleRateHz:(int)targetSampleRateHz
                           outSamples:(NSArray<NSNumber *> **)outSamples
                        outSampleRate:(int *)outSampleRate
                                error:(NSError **)error
{
    // Both output pointers are mandatory; refuse to run without them.
    const BOOL haveOutputs = (outSamples != NULL) && (outSampleRate != NULL);
    if (!haveOutputs) {
        if (error != NULL) {
            NSDictionary *info = @{NSLocalizedDescriptionKey: @"outSamples/outSampleRate required"};
            *error = [NSError errorWithDomain:@"SherpaOnnxAudioConvert" code:-2 userInfo:info];
        }
        return NO;
    }

    // Reset the outputs so callers never observe stale values on failure.
    *outSamples = nil;
    *outSampleRate = 0;

    std::vector<float> decoded;
    int decodedRate = 0;
    const std::string failure =
        decodeAudioFileToFloatMono([inputPath UTF8String], targetSampleRateHz, &decoded, &decodedRate);
    if (!failure.empty()) {
        if (error != NULL) {
            NSString *message = [NSString stringWithUTF8String:failure.c_str()];
            *error = [NSError errorWithDomain:@"SherpaOnnxAudioConvert"
                                         code:-1
                                     userInfo:@{NSLocalizedDescriptionKey: message}];
        }
        return NO;
    }

    // Box every float sample into an NSNumber.
    NSMutableArray<NSNumber *> *boxed = [NSMutableArray arrayWithCapacity:decoded.size()];
    for (const float sample : decoded) {
        [boxed addObject:[NSNumber numberWithFloat:sample]];
    }

    *outSamples = boxed;
    *outSampleRate = decodedRate;
    return YES;
}
|
|
953
|
+
|
|
698
954
|
@end
|
|
@@ -23,6 +23,9 @@ typedef void (^SherpaOnnxArchiveProgressBlock)(long long bytes, long long totalB
|
|
|
23
23
|
|
|
24
24
|
+ (void)cancelExtractTarZst;
|
|
25
25
|
|
|
26
|
+
/** Cancel extraction for a specific source archive path (per-operation cancel for parallel extractions). */
|
|
27
|
+
+ (void)cancelExtractForPath:(NSString *)sourcePath;
|
|
28
|
+
|
|
26
29
|
@end
|
|
27
30
|
|
|
28
31
|
NS_ASSUME_NONNULL_END
|
|
@@ -14,9 +14,24 @@
|
|
|
14
14
|
#include <array>
|
|
15
15
|
#include <atomic>
|
|
16
16
|
#include <cstdio>
|
|
17
|
+
#include <mutex>
|
|
18
|
+
#include <set>
|
|
17
19
|
#include <string>
|
|
18
20
|
|
|
19
|
-
static std::
|
|
21
|
+
static std::mutex g_cancelMutex;
|
|
22
|
+
static std::set<std::string> g_cancelledPaths;
|
|
23
|
+
|
|
24
|
+
static bool isPathCancelled(const std::string& path) {
|
|
25
|
+
std::lock_guard<std::mutex> lock(g_cancelMutex);
|
|
26
|
+
// If the set contains an empty string, ALL extractions are cancelled (legacy global cancel).
|
|
27
|
+
return g_cancelledPaths.count("") > 0 || g_cancelledPaths.count(path) > 0;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
static void clearCancelForPath(const std::string& path) {
|
|
31
|
+
std::lock_guard<std::mutex> lock(g_cancelMutex);
|
|
32
|
+
g_cancelledPaths.erase(path);
|
|
33
|
+
g_cancelledPaths.erase(""); // Clear the global cancel flag too
|
|
34
|
+
}
|
|
20
35
|
|
|
21
36
|
namespace {
|
|
22
37
|
#ifdef HAVE_LIBARCHIVE
|
|
@@ -127,7 +142,8 @@ static NSString* ComputeFileSha256(NSString* filePath, NSError** error) {
|
|
|
127
142
|
+ (void)cancelExtractTarBz2
|
|
128
143
|
{
|
|
129
144
|
#ifdef HAVE_LIBARCHIVE
|
|
130
|
-
|
|
145
|
+
std::lock_guard<std::mutex> lock(g_cancelMutex);
|
|
146
|
+
g_cancelledPaths.insert(""); // Empty string = cancel ALL
|
|
131
147
|
#else
|
|
132
148
|
// feature disabled
|
|
133
149
|
#endif
|
|
@@ -136,7 +152,21 @@ static NSString* ComputeFileSha256(NSString* filePath, NSError** error) {
|
|
|
136
152
|
/**
 * Requests cancellation of every running extraction by recording the empty-string
 * marker in g_cancelledPaths. Does nothing when libarchive support is compiled out.
 */
+ (void)cancelExtractTarZst
{
#ifdef HAVE_LIBARCHIVE
    const std::lock_guard<std::mutex> guard(g_cancelMutex);
    // Empty string = cancel ALL
    g_cancelledPaths.insert(std::string());
#else
    // feature disabled
#endif
}
|
|
161
|
+
|
|
162
|
+
+ (void)cancelExtractForPath:(NSString *)sourcePath
|
|
163
|
+
{
|
|
164
|
+
#ifdef HAVE_LIBARCHIVE
|
|
165
|
+
std::string path = [sourcePath UTF8String] ?: "";
|
|
166
|
+
if (!path.empty()) {
|
|
167
|
+
std::lock_guard<std::mutex> lock(g_cancelMutex);
|
|
168
|
+
g_cancelledPaths.insert(path);
|
|
169
|
+
}
|
|
140
170
|
#else
|
|
141
171
|
// feature disabled
|
|
142
172
|
#endif
|
|
@@ -150,7 +180,8 @@ static NSString* ComputeFileSha256(NSString* filePath, NSError** error) {
|
|
|
150
180
|
#ifndef HAVE_LIBARCHIVE
|
|
151
181
|
return @{ @"success": @NO, @"reason": @"libarchive is disabled in this build. Rebuild without SHERPA_ONNX_DISABLE_LIBARCHIVE=1." };
|
|
152
182
|
#else
|
|
153
|
-
|
|
183
|
+
std::string sourcePathStr = [sourcePath UTF8String] ?: "";
|
|
184
|
+
clearCancelForPath(sourcePathStr);
|
|
154
185
|
NSFileManager *fileManager = [NSFileManager defaultManager];
|
|
155
186
|
|
|
156
187
|
if (![fileManager fileExistsAtPath:sourcePath]) {
|
|
@@ -213,10 +244,11 @@ static NSString* ComputeFileSha256(NSString* filePath, NSError** error) {
|
|
|
213
244
|
int lastPercent = -1;
|
|
214
245
|
long long lastEmitBytes = 0;
|
|
215
246
|
while ((result = archive_read_next_header(archive, &entry)) == ARCHIVE_OK) {
|
|
216
|
-
if (
|
|
247
|
+
if (isPathCancelled(sourcePathStr)) {
|
|
217
248
|
archive_read_free(archive);
|
|
218
249
|
archive_write_free(disk);
|
|
219
250
|
close_reader();
|
|
251
|
+
clearCancelForPath(sourcePathStr);
|
|
220
252
|
return @{ @"success": @NO, @"reason": @"Extraction cancelled" };
|
|
221
253
|
}
|
|
222
254
|
const char *currentPath = archive_entry_pathname(entry);
|
|
@@ -245,10 +277,11 @@ static NSString* ComputeFileSha256(NSString* filePath, NSError** error) {
|
|
|
245
277
|
size_t size = 0;
|
|
246
278
|
la_int64_t offset = 0;
|
|
247
279
|
while ((result = archive_read_data_block(archive, &buff, &size, &offset)) == ARCHIVE_OK) {
|
|
248
|
-
if (
|
|
280
|
+
if (isPathCancelled(sourcePathStr)) {
|
|
249
281
|
archive_read_free(archive);
|
|
250
282
|
archive_write_free(disk);
|
|
251
283
|
close_reader();
|
|
284
|
+
clearCancelForPath(sourcePathStr);
|
|
252
285
|
return @{ @"success": @NO, @"reason": @"Extraction cancelled" };
|
|
253
286
|
}
|
|
254
287
|
la_ssize_t writeResult = archive_write_data_block(disk, buff, size, offset);
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* sherpa-onnx-model-detect-tts.mm
|
|
3
3
|
*
|
|
4
4
|
* Purpose: Detects TTS (text-to-speech) model type and fills TtsModelPaths from a model directory.
|
|
5
|
-
* Used by the TTS wrapper on iOS. Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice.
|
|
5
|
+
* Used by the TTS wrapper on iOS. Supports Vits, Matcha, Kokoro, Kitten, Pocket, Zipvoice, Supertonic.
|
|
6
6
|
*
|
|
7
7
|
* --- Detection pipeline (overview) ---
|
|
8
8
|
*
|
|
@@ -39,6 +39,13 @@
|
|
|
39
39
|
#include <string>
|
|
40
40
|
#include <vector>
|
|
41
41
|
|
|
42
|
+
#if defined(__APPLE__)
|
|
43
|
+
#include <Foundation/Foundation.h>
|
|
44
|
+
#define TTS_DETECT_LOGI(fmt, ...) NSLog(@"[TtsModelDetect] " fmt, ##__VA_ARGS__)
|
|
45
|
+
#else
|
|
46
|
+
#define TTS_DETECT_LOGI(fmt, ...) ((void)0)
|
|
47
|
+
#endif
|
|
48
|
+
|
|
42
49
|
namespace sherpaonnx {
|
|
43
50
|
namespace {
|
|
44
51
|
|
|
@@ -51,18 +58,20 @@ TtsModelKind ParseTtsModelType(const std::string& modelType) {
|
|
|
51
58
|
if (modelType == "kitten") return TtsModelKind::kKitten;
|
|
52
59
|
if (modelType == "pocket") return TtsModelKind::kPocket;
|
|
53
60
|
if (modelType == "zipvoice") return TtsModelKind::kZipvoice;
|
|
61
|
+
if (modelType == "supertonic") return TtsModelKind::kSupertonic;
|
|
54
62
|
return TtsModelKind::kUnknown;
|
|
55
63
|
}
|
|
56
64
|
|
|
57
65
|
/** Returns true if the given kind is supported by the current paths and hints (required files present).
|
|
58
|
-
* data_dir (espeak-ng-data) is required
|
|
59
|
-
* VITS
|
|
66
|
+
* data_dir (espeak-ng-data) is required for Kitten, Kokoro, and Zipvoice (Zipvoice uses MatchaTtsLexicon + espeak).
|
|
67
|
+
* VITS and Matcha use dataDir optionally in this detector; Pocket does not use it. */
|
|
60
68
|
static bool CapabilitySupportsTtsKind(
|
|
61
69
|
TtsModelKind kind,
|
|
62
70
|
bool hasVits,
|
|
63
71
|
bool hasMatcha,
|
|
64
72
|
bool hasPocket,
|
|
65
73
|
bool hasZipvoice,
|
|
74
|
+
bool hasSupertonic,
|
|
66
75
|
bool hasVoicesFile,
|
|
67
76
|
bool hasDataDir
|
|
68
77
|
) {
|
|
@@ -78,6 +87,8 @@ static bool CapabilitySupportsTtsKind(
|
|
|
78
87
|
return hasPocket;
|
|
79
88
|
case TtsModelKind::kZipvoice:
|
|
80
89
|
return hasZipvoice;
|
|
90
|
+
case TtsModelKind::kSupertonic:
|
|
91
|
+
return hasSupertonic;
|
|
81
92
|
default:
|
|
82
93
|
return false;
|
|
83
94
|
}
|
|
@@ -102,6 +113,7 @@ static std::vector<TtsModelKind> GetKindsFromDirNameTts(const std::string& model
|
|
|
102
113
|
if (lower.find("matcha") != std::string::npos) add(TtsModelKind::kMatcha);
|
|
103
114
|
if (lower.find("pocket") != std::string::npos) add(TtsModelKind::kPocket);
|
|
104
115
|
if (lower.find("zipvoice") != std::string::npos) add(TtsModelKind::kZipvoice);
|
|
116
|
+
if (lower.find("supertonic") != std::string::npos) add(TtsModelKind::kSupertonic);
|
|
105
117
|
if (lower.find("kokoro") != std::string::npos) add(TtsModelKind::kKokoro);
|
|
106
118
|
if (lower.find("kitten") != std::string::npos) add(TtsModelKind::kKitten);
|
|
107
119
|
if (lower.find("vits") != std::string::npos) add(TtsModelKind::kVits);
|
|
@@ -132,6 +144,10 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
132
144
|
std::string tokensFile = FindFileByName(files, "tokens.txt");
|
|
133
145
|
std::vector<LexiconCandidate> lexiconCandidates = FindLexiconCandidates(files, modelDir);
|
|
134
146
|
std::string dataDirPath = FindDirectoryUnderRoot(files, modelDir, "espeak-ng-data");
|
|
147
|
+
TTS_DETECT_LOGI("DetectTtsModel: modelDir=%s espeak-ng dataDir=%s (empty=%d)",
|
|
148
|
+
modelDir.c_str(),
|
|
149
|
+
dataDirPath.empty() ? "(empty)" : dataDirPath.c_str(),
|
|
150
|
+
(int)dataDirPath.empty());
|
|
135
151
|
std::string voicesFile = FindFileByName(files, "voices.bin");
|
|
136
152
|
|
|
137
153
|
std::string acousticModel = FindOnnxByAnyToken(files, {"acoustic_model", "acoustic-model"}, std::nullopt);
|
|
@@ -143,14 +159,27 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
143
159
|
std::string textConditioner = FindOnnxByAnyToken(files, {"text_conditioner", "text-conditioner"}, std::nullopt);
|
|
144
160
|
std::string vocabJsonFile = FindFileByName(files, "vocab.json");
|
|
145
161
|
std::string tokenScoresJsonFile = FindFileByName(files, "token_scores.json");
|
|
162
|
+
std::string durationPredictor = FindOnnxByAnyToken(files, {"duration_predictor", "duration-predictor"}, std::nullopt);
|
|
163
|
+
std::string textEncoderSupertonic = FindOnnxByAnyToken(files, {"text_encoder", "text-encoder"}, std::nullopt);
|
|
164
|
+
std::string vectorEstimator = FindOnnxByAnyToken(files, {"vector_estimator", "vector-estimator"}, std::nullopt);
|
|
165
|
+
std::string ttsJsonFile = FindFileByName(files, "tts.json");
|
|
166
|
+
std::string unicodeIndexerFile = FindFileByName(files, "unicode_indexer.bin");
|
|
167
|
+
std::string voiceStyleFile = FindFileByName(files, "voice.bin");
|
|
146
168
|
|
|
147
|
-
std::vector<std::string> modelExcludes = {
|
|
169
|
+
std::vector<std::string> modelExcludes = {
|
|
170
|
+
"acoustic", "vocoder", "encoder", "decoder", "joiner",
|
|
171
|
+
// Supertonic component models are not VITS monolithic model.onnx files.
|
|
172
|
+
"duration_predictor", "duration-predictor",
|
|
173
|
+
"text_encoder", "text-encoder",
|
|
174
|
+
"vector_estimator", "vector-estimator"
|
|
175
|
+
};
|
|
148
176
|
std::string ttsModel = FindOnnxByAnyToken(files, {"model"}, std::nullopt);
|
|
149
177
|
if (ttsModel.empty()) {
|
|
150
178
|
ttsModel = FindLargestOnnxExcludingTokens(files, modelExcludes);
|
|
151
179
|
}
|
|
152
180
|
|
|
153
|
-
|
|
181
|
+
// VITS requires both model.onnx-like file and tokens.txt
|
|
182
|
+
bool hasVits = !ttsModel.empty() && !tokensFile.empty();
|
|
154
183
|
std::string modelDirLower = ToLower(modelDir);
|
|
155
184
|
bool isLikelyMatcha = modelDirLower.find("matcha") != std::string::npos;
|
|
156
185
|
bool hasMatcha = (!acousticModel.empty() && !vocoder.empty())
|
|
@@ -167,6 +196,9 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
167
196
|
}
|
|
168
197
|
bool hasPocket = !lmFlow.empty() && !lmMain.empty() && !encoder.empty() && !decoder.empty() &&
|
|
169
198
|
!textConditioner.empty() && !vocabJsonFile.empty() && !tokenScoresJsonFile.empty();
|
|
199
|
+
bool hasSupertonic = !durationPredictor.empty() && !textEncoderSupertonic.empty() &&
|
|
200
|
+
!vectorEstimator.empty() && !vocoder.empty() && !ttsJsonFile.empty() &&
|
|
201
|
+
!unicodeIndexerFile.empty() && !voiceStyleFile.empty();
|
|
170
202
|
bool hasDataDir = !dataDirPath.empty();
|
|
171
203
|
|
|
172
204
|
bool isLikelyKitten = modelDirLower.find("kitten") != std::string::npos;
|
|
@@ -181,6 +213,9 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
181
213
|
if (hasZipvoice && !hasMatcha) {
|
|
182
214
|
result.detectedModels.push_back({"zipvoice", modelDir});
|
|
183
215
|
}
|
|
216
|
+
if (hasSupertonic) {
|
|
217
|
+
result.detectedModels.push_back({"supertonic", modelDir});
|
|
218
|
+
}
|
|
184
219
|
if (hasVoicesFile) {
|
|
185
220
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
186
221
|
result.detectedModels.push_back({"kitten", modelDir});
|
|
@@ -217,7 +252,7 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
217
252
|
std::vector<TtsModelKind> nameCandidates = GetKindsFromDirNameTts(modelDir);
|
|
218
253
|
if (!nameCandidates.empty()) {
|
|
219
254
|
for (TtsModelKind k : nameCandidates) {
|
|
220
|
-
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice,
|
|
255
|
+
if (CapabilitySupportsTtsKind(k, hasVits, hasMatcha, hasPocket, hasZipvoice, hasSupertonic,
|
|
221
256
|
hasVoicesFile, hasDataDir)) {
|
|
222
257
|
selected = k;
|
|
223
258
|
break;
|
|
@@ -232,6 +267,8 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
232
267
|
selected = TtsModelKind::kPocket;
|
|
233
268
|
} else if (hasZipvoice) {
|
|
234
269
|
selected = TtsModelKind::kZipvoice;
|
|
270
|
+
} else if (hasSupertonic) {
|
|
271
|
+
selected = TtsModelKind::kSupertonic;
|
|
235
272
|
} else if (hasVoicesFile) {
|
|
236
273
|
if (isLikelyKitten && !isLikelyKokoro) {
|
|
237
274
|
selected = TtsModelKind::kKitten;
|
|
@@ -278,6 +315,12 @@ TtsDetectResult DetectTtsModel(const std::string& modelDir, const std::string& m
|
|
|
278
315
|
result.paths.textConditioner = textConditioner;
|
|
279
316
|
result.paths.vocabJson = vocabJsonFile;
|
|
280
317
|
result.paths.tokenScoresJson = tokenScoresJsonFile;
|
|
318
|
+
result.paths.durationPredictor = durationPredictor;
|
|
319
|
+
result.paths.textEncoder = textEncoderSupertonic;
|
|
320
|
+
result.paths.vectorEstimator = vectorEstimator;
|
|
321
|
+
result.paths.ttsJson = ttsJsonFile;
|
|
322
|
+
result.paths.unicodeIndexer = unicodeIndexerFile;
|
|
323
|
+
result.paths.voiceStyle = voiceStyleFile;
|
|
281
324
|
|
|
282
325
|
auto validation = ValidateTtsPaths(selected, result.paths, modelDir);
|
|
283
326
|
if (!validation.ok) {
|
|
@@ -37,7 +37,8 @@ enum class TtsModelKind {
|
|
|
37
37
|
kKokoro,
|
|
38
38
|
kKitten,
|
|
39
39
|
kPocket,
|
|
40
|
-
kZipvoice
|
|
40
|
+
kZipvoice,
|
|
41
|
+
kSupertonic
|
|
41
42
|
};
|
|
42
43
|
|
|
43
44
|
struct SttModelPaths {
|
|
@@ -150,6 +151,13 @@ struct TtsModelPaths {
|
|
|
150
151
|
std::string textConditioner;
|
|
151
152
|
std::string vocabJson;
|
|
152
153
|
std::string tokenScoresJson;
|
|
154
|
+
// Supertonic TTS
|
|
155
|
+
std::string durationPredictor;
|
|
156
|
+
std::string textEncoder;
|
|
157
|
+
std::string vectorEstimator;
|
|
158
|
+
std::string ttsJson;
|
|
159
|
+
std::string unicodeIndexer;
|
|
160
|
+
std::string voiceStyle;
|
|
153
161
|
};
|
|
154
162
|
|
|
155
163
|
struct SttDetectResult {
|
|
@@ -55,8 +55,18 @@ static const TtsFieldRequirement kZipvoiceReqs[] = {
|
|
|
55
55
|
{"decoder", &TtsModelPaths::decoder, true},
|
|
56
56
|
{"vocoder", &TtsModelPaths::vocoder, true},
|
|
57
57
|
{"tokens", &TtsModelPaths::tokens, true},
|
|
58
|
-
{"dataDir", &TtsModelPaths::dataDir,
|
|
59
|
-
{"lexicon", &TtsModelPaths::lexicon,
|
|
58
|
+
{"dataDir", &TtsModelPaths::dataDir, true},
|
|
59
|
+
{"lexicon", &TtsModelPaths::lexicon, true},
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
static const TtsFieldRequirement kSupertonicReqs[] = {
|
|
63
|
+
{"durationPredictor", &TtsModelPaths::durationPredictor, true},
|
|
64
|
+
{"textEncoder", &TtsModelPaths::textEncoder, true},
|
|
65
|
+
{"vectorEstimator", &TtsModelPaths::vectorEstimator, true},
|
|
66
|
+
{"vocoder", &TtsModelPaths::vocoder, true},
|
|
67
|
+
{"ttsJson", &TtsModelPaths::ttsJson, true},
|
|
68
|
+
{"unicodeIndexer", &TtsModelPaths::unicodeIndexer, true},
|
|
69
|
+
{"voiceStyle", &TtsModelPaths::voiceStyle, true},
|
|
60
70
|
};
|
|
61
71
|
|
|
62
72
|
// ============================================================
|
|
@@ -79,6 +89,9 @@ static const TtsFieldRequirement* GetRequirements(TtsModelKind kind, size_t& cou
|
|
|
79
89
|
case TtsModelKind::kZipvoice:
|
|
80
90
|
count = std::size(kZipvoiceReqs);
|
|
81
91
|
return kZipvoiceReqs;
|
|
92
|
+
case TtsModelKind::kSupertonic:
|
|
93
|
+
count = std::size(kSupertonicReqs);
|
|
94
|
+
return kSupertonicReqs;
|
|
82
95
|
default:
|
|
83
96
|
count = 0;
|
|
84
97
|
return nullptr;
|
|
@@ -93,6 +106,7 @@ static const char* TtsKindToName(TtsModelKind k) {
|
|
|
93
106
|
case TtsModelKind::kKitten: return "Kitten";
|
|
94
107
|
case TtsModelKind::kPocket: return "Pocket";
|
|
95
108
|
case TtsModelKind::kZipvoice: return "Zipvoice";
|
|
109
|
+
case TtsModelKind::kSupertonic: return "Supertonic";
|
|
96
110
|
default: return "Unknown";
|
|
97
111
|
}
|
|
98
112
|
}
|
|
@@ -102,6 +116,8 @@ static const char* GetFieldHint(const char* fieldName) {
|
|
|
102
116
|
return "Copy espeak-ng-data into the model directory.";
|
|
103
117
|
if (std::strcmp(fieldName, "tokens") == 0)
|
|
104
118
|
return "Ensure tokens.txt is present in the model directory.";
|
|
119
|
+
if (std::strcmp(fieldName, "lexicon") == 0)
|
|
120
|
+
return "Add lexicon.txt (or lexicon-<lang>.txt) from the official sherpa-onnx Zipvoice/Matcha release; without it the native engine aborts.";
|
|
105
121
|
return nullptr;
|
|
106
122
|
}
|
|
107
123
|
|
|
@@ -103,6 +103,7 @@ OnlineSttInitResult OnlineSttWrapper::initialize(
|
|
|
103
103
|
const std::string& provider,
|
|
104
104
|
const std::string& ruleFsts,
|
|
105
105
|
const std::string& ruleFars,
|
|
106
|
+
float dither,
|
|
106
107
|
float blankPenalty,
|
|
107
108
|
bool debug,
|
|
108
109
|
// NOTE: rule*MustContainNonSilence, rule1/2MinUtteranceLength, and
|
|
@@ -138,6 +139,9 @@ OnlineSttInitResult OnlineSttWrapper::initialize(
|
|
|
138
139
|
sherpa_onnx::cxx::OnlineRecognizerConfig config;
|
|
139
140
|
config.feat_config.sample_rate = 16000;
|
|
140
141
|
config.feat_config.feature_dim = 80;
|
|
142
|
+
// Dither is not exposed on cxx::FeatureConfig in the bundled sherpa-onnx headers;
|
|
143
|
+
// Android applies it via JNI. iOS uses the library default (no dither from JS).
|
|
144
|
+
(void)dither;
|
|
141
145
|
config.decoding_method = decodingMethod.empty() ? "greedy_search" : decodingMethod;
|
|
142
146
|
config.max_active_paths = maxActivePaths;
|
|
143
147
|
config.enable_endpoint = enableEndpoint;
|