react-native-sherpa-onnx 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +232 -236
- package/SherpaOnnx.podspec +68 -64
- package/android/build.gradle +182 -192
- package/android/codegen.gradle +57 -0
- package/android/prebuilt-download.gradle +428 -0
- package/android/prebuilt-versions.gradle +43 -0
- package/android/proguard-rules.pro +10 -0
- package/android/src/main/assets/testModels/add_mul_add.onnx +28 -0
- package/android/src/main/assets/testModels/nnapi_internal_uint8_support.onnx +0 -0
- package/android/src/main/assets/testModels/qnn_multi_ctx_embed.onnx +0 -0
- package/android/src/main/cpp/CMakeLists.txt +166 -129
- package/android/src/main/cpp/CMakePresets.json +54 -0
- package/android/src/main/cpp/crypto/sha256.cpp +174 -0
- package/android/src/main/cpp/crypto/sha256.h +16 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +404 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +56 -0
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +181 -0
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +888 -0
- package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-common.h +18 -18
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +86 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +20 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +423 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +55 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +399 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +238 -0
- package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-model-detect.h +122 -89
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +99 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.h +16 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +78 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.h +16 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +190 -0
- package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +301 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +94 -0
- package/android/src/main/java/com/sherpaonnx/{SherpaOnnxCoreHelper.kt → SherpaOnnxAssetHelper.kt} +350 -236
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +791 -483
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +699 -109
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +1123 -668
- package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +187 -0
- package/ios/SherpaOnnx+Assets.h +11 -0
- package/ios/SherpaOnnx+Assets.mm +325 -0
- package/ios/SherpaOnnx+STT.mm +455 -118
- package/ios/SherpaOnnx+TTS.mm +1101 -712
- package/ios/SherpaOnnx.h +17 -6
- package/ios/SherpaOnnx.mm +206 -311
- package/ios/SherpaOnnx.xcconfig +19 -19
- package/ios/SherpaOnnxCoreMLHelper.swift +24 -0
- package/ios/archive/sherpa-onnx-archive-helper.h +21 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +296 -0
- package/ios/libarchive_darwin_config.h +153 -0
- package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-common.h +18 -18
- package/ios/model_detect/sherpa-onnx-model-detect-helper.h +49 -0
- package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +210 -0
- package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +344 -0
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +201 -0
- package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-model-detect.h +117 -89
- package/ios/scripts/patch-libarchive-includes.sh +61 -0
- package/ios/scripts/setup-ios-libarchive.sh +98 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +129 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +523 -0
- package/ios/{sherpa-onnx-tts-wrapper.h → tts/sherpa-onnx-tts-wrapper.h} +90 -85
- package/ios/{sherpa-onnx-tts-wrapper.mm → tts/sherpa-onnx-tts-wrapper.mm} +376 -345
- package/lib/module/NativeSherpaOnnx.js +3 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +22 -0
- package/lib/module/audio/index.js.map +1 -0
- package/lib/module/diarization/index.js +1 -1
- package/lib/module/diarization/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +918 -0
- package/lib/module/download/ModelDownloadManager.js.map +1 -0
- package/lib/module/download/extractTarBz2.js +53 -0
- package/lib/module/download/extractTarBz2.js.map +1 -0
- package/lib/module/download/index.js +6 -0
- package/lib/module/download/index.js.map +1 -0
- package/lib/module/download/validation.js +178 -0
- package/lib/module/download/validation.js.map +1 -0
- package/lib/module/enhancement/index.js +1 -1
- package/lib/module/enhancement/index.js.map +1 -1
- package/lib/module/index.js +41 -3
- package/lib/module/index.js.map +1 -1
- package/lib/module/separation/index.js +1 -1
- package/lib/module/separation/index.js.map +1 -1
- package/lib/module/stt/index.js +127 -60
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/sttModelLanguages.js +512 -0
- package/lib/module/stt/sttModelLanguages.js.map +1 -0
- package/lib/module/stt/types.js +53 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +216 -289
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/types.js +86 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/module/types.js.map +1 -1
- package/lib/module/utils.js +86 -73
- package/lib/module/utils.js.map +1 -1
- package/lib/module/vad/index.js +1 -1
- package/lib/module/vad/index.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +192 -38
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +13 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -0
- package/lib/typescript/src/diarization/index.d.ts +3 -2
- package/lib/typescript/src/diarization/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +108 -0
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -0
- package/lib/typescript/src/download/extractTarBz2.d.ts +14 -0
- package/lib/typescript/src/download/extractTarBz2.d.ts.map +1 -0
- package/lib/typescript/src/download/index.d.ts +7 -0
- package/lib/typescript/src/download/index.d.ts.map +1 -0
- package/lib/typescript/src/download/validation.d.ts +57 -0
- package/lib/typescript/src/download/validation.d.ts.map +1 -0
- package/lib/typescript/src/enhancement/index.d.ts +3 -2
- package/lib/typescript/src/enhancement/index.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +26 -2
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/separation/index.d.ts +3 -2
- package/lib/typescript/src/separation/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/index.d.ts +31 -43
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/sttModelLanguages.d.ts +52 -0
- package/lib/typescript/src/stt/sttModelLanguages.d.ts.map +1 -0
- package/lib/typescript/src/stt/types.d.ts +196 -9
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +25 -211
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +148 -25
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/lib/typescript/src/types.d.ts +0 -32
- package/lib/typescript/src/types.d.ts.map +1 -1
- package/lib/typescript/src/utils.d.ts +28 -13
- package/lib/typescript/src/utils.d.ts.map +1 -1
- package/lib/typescript/src/vad/index.d.ts +3 -2
- package/lib/typescript/src/vad/index.d.ts.map +1 -1
- package/package.json +250 -222
- package/scripts/check-qnn-support.sh +78 -0
- package/scripts/setup-ios-framework.sh +379 -282
- package/src/NativeSherpaOnnx.ts +474 -251
- package/src/audio/index.ts +32 -0
- package/src/diarization/index.ts +4 -2
- package/src/download/ModelDownloadManager.ts +1325 -0
- package/src/download/extractTarBz2.ts +78 -0
- package/src/download/index.ts +43 -0
- package/src/download/validation.ts +279 -0
- package/src/enhancement/index.ts +4 -2
- package/src/index.tsx +78 -27
- package/src/separation/index.ts +4 -2
- package/src/stt/index.ts +249 -89
- package/src/stt/sttModelLanguages.ts +237 -0
- package/src/stt/types.ts +263 -9
- package/src/tts/index.ts +470 -458
- package/src/tts/types.ts +373 -218
- package/src/types.ts +0 -44
- package/src/utils.ts +145 -131
- package/src/vad/index.ts +4 -2
- package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -0
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -0
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -0
- package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +0 -1918
- package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +0 -841
- package/android/src/main/cpp/jni/sherpa-onnx-model-detect.cpp +0 -541
- package/android/src/main/cpp/jni/sherpa-onnx-stt-jni.cpp +0 -336
- package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.cpp +0 -222
- package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.h +0 -68
- package/android/src/main/cpp/jni/sherpa-onnx-tts-jni.cpp +0 -823
- package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.cpp +0 -387
- package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.h +0 -147
- package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
- package/ios/include/sherpa-onnx/c-api/c-api.h +0 -1918
- package/ios/include/sherpa-onnx/c-api/cxx-api.h +0 -841
- package/ios/sherpa-onnx-model-detect.mm +0 -441
- package/ios/sherpa-onnx-stt-wrapper.h +0 -48
- package/ios/sherpa-onnx-stt-wrapper.mm +0 -201
- package/scripts/copy-headers.js +0 -184
- package/scripts/setup-assets.js +0 -323
|
@@ -0,0 +1,888 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* sherpa-onnx-audio-convert-jni.cpp
|
|
3
|
+
*
|
|
4
|
+
* Purpose: JNI for converting arbitrary audio files to WAV 16 kHz mono 16-bit PCM (sherpa-onnx
|
|
5
|
+
* input format). When HAVE_FFMPEG is set, FFmpeg is used; otherwise nativeConvertAudioToWav16k
|
|
6
|
+
* returns an error. Used by the Kotlin audio conversion API.
|
|
7
|
+
*/
|
|
8
|
+
#include <android/log.h>
|
|
9
|
+
#include <jni.h>
|
|
10
|
+
#include <string>
|
|
11
|
+
|
|
12
|
+
#define LOG_TAG "AudioConvertJNI"
|
|
13
|
+
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
|
|
14
|
+
#define LOGW(...) __android_log_print(ANDROID_LOG_WARN, LOG_TAG, __VA_ARGS__)
|
|
15
|
+
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
|
|
16
|
+
|
|
17
|
+
#ifdef HAVE_FFMPEG
|
|
18
|
+
extern "C" {
|
|
19
|
+
#include <libavcodec/avcodec.h>
|
|
20
|
+
#include <libavformat/avformat.h>
|
|
21
|
+
#include <libavutil/opt.h>
|
|
22
|
+
#include <libswresample/swresample.h>
|
|
23
|
+
}
|
|
24
|
+
#include <cstdio>
|
|
25
|
+
#include <vector>
|
|
26
|
+
#endif
|
|
27
|
+
|
|
28
|
+
// Returns empty string on success, or error message on failure.
|
|
29
|
+
// Output is always 16 kHz mono 16-bit PCM (sherpa-onnx requirement). Input can be any rate; we resample to 16k.
|
|
30
|
+
static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
#ifdef HAVE_FFMPEG
    // Pipeline: demux -> decode -> resample (16 kHz, mono, s16) -> PCM encode -> mux.
    // The output container is chosen by libavformat from outputPath's extension.
    //
    // inputPath:  path of the audio file to read.
    // outputPath: path of the file to write.
    // Returns an empty string on success, otherwise a short error message.
    // Individual packets/frames that fail to decode or resample are skipped (best effort);
    // failures to write the output are reported as errors.
    if (!inputPath || !outputPath) {
        return std::string("Invalid input or output path");
    }

    av_log_set_level(AV_LOG_ERROR);

    const int kOutputSampleRate = 16000;

    // Single owner for every libav* object used below, so each return path releases
    // everything exactly once instead of repeating (and occasionally missing) cleanup calls.
    struct Resources {
        AVFormatContext* inFmt = nullptr;
        AVCodecContext* decCtx = nullptr;
        SwrContext* swr = nullptr;
        AVFormatContext* outFmt = nullptr;
        AVCodecContext* encCtx = nullptr;
        AVPacket* pkt = nullptr;
        AVPacket* outPkt = nullptr;
        AVFrame* frame = nullptr;
        AVFrame* resampled = nullptr;

        ~Resources() {
            av_packet_free(&outPkt);
            av_packet_free(&pkt);
            av_frame_free(&frame);
            av_frame_free(&resampled);
            swr_free(&swr);
            avcodec_free_context(&encCtx);
            if (outFmt) {
                // The AVIO handle is opened by us, so it must be closed by us as well.
                if (outFmt->pb && outFmt->oformat && !(outFmt->oformat->flags & AVFMT_NOFILE)) {
                    avio_closep(&outFmt->pb);
                }
                avformat_free_context(outFmt);
                outFmt = nullptr;
            }
            avcodec_free_context(&decCtx);
            if (inFmt) {
                avformat_close_input(&inFmt);
            }
        }
    };
    Resources res;

    // --- Input: open the file and pick the first audio stream ---
    if (avformat_open_input(&res.inFmt, inputPath, nullptr, nullptr) < 0) {
        return std::string("Failed to open input file");
    }
    if (avformat_find_stream_info(res.inFmt, nullptr) < 0) {
        return std::string("Failed to find stream info");
    }

    int audioStreamIndex = -1;
    for (unsigned i = 0; i < res.inFmt->nb_streams; ++i) {
        if (res.inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            audioStreamIndex = static_cast<int>(i);
            break;
        }
    }
    if (audioStreamIndex < 0) {
        return std::string("No audio stream found in input");
    }

    AVStream* inStream = res.inFmt->streams[audioStreamIndex];
    const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
    if (!decoder) {
        return std::string("Unsupported input codec");
    }

    res.decCtx = avcodec_alloc_context3(decoder);
    if (!res.decCtx) {
        return std::string("Failed to allocate decoder context");
    }
    if (avcodec_parameters_to_context(res.decCtx, inStream->codecpar) < 0) {
        return std::string("Failed to copy codec parameters");
    }
    if (avcodec_open2(res.decCtx, decoder, nullptr) < 0) {
        return std::string("Failed to open decoder");
    }

    // --- Resampler: input layout/format/rate -> mono s16 @ 16 kHz ---
    AVChannelLayout monoLayout = AV_CHANNEL_LAYOUT_MONO;
    // Prefer the stream's channel layout when it is populated, otherwise use the decoder's.
    // swr_alloc_set_opts2 copies the layout, so no temporary copy is required here.
    const AVChannelLayout* inLayout = inStream->codecpar->ch_layout.nb_channels
        ? &inStream->codecpar->ch_layout
        : &res.decCtx->ch_layout;
    const int inRate = res.decCtx->sample_rate;
    // swr_init() is mandatory: swr_convert() rejects a context that was only allocated.
    if (swr_alloc_set_opts2(&res.swr,
                            &monoLayout, AV_SAMPLE_FMT_S16, kOutputSampleRate,
                            inLayout, res.decCtx->sample_fmt, inRate,
                            0, nullptr) < 0
        || !res.swr
        || swr_init(res.swr) < 0) {
        return std::string("Failed to initialize resampler");
    }

    // --- Output: muxer + PCM s16le encoder ---
    if (avformat_alloc_output_context2(&res.outFmt, nullptr, nullptr, outputPath) < 0 || !res.outFmt) {
        return std::string("Failed to allocate output context");
    }

    const AVCodec* pcmCodec = avcodec_find_encoder(AV_CODEC_ID_PCM_S16LE);
    if (!pcmCodec) {
        return std::string("PCM encoder not found");
    }

    AVStream* outStream = avformat_new_stream(res.outFmt, nullptr);
    if (!outStream) {
        return std::string("Failed to create output stream");
    }

    res.encCtx = avcodec_alloc_context3(pcmCodec);
    if (!res.encCtx) {
        return std::string("Failed to allocate encoder context");
    }
    if (av_channel_layout_copy(&res.encCtx->ch_layout, &monoLayout) < 0) {
        return std::string("Failed to set encoder channel layout");
    }
    res.encCtx->sample_rate = kOutputSampleRate;
    res.encCtx->sample_fmt = AV_SAMPLE_FMT_S16;
    res.encCtx->bit_rate = 16 * kOutputSampleRate;  // 16 bits * 16000 Hz * 1 channel
    // Timestamps below are counted in output samples.
    res.encCtx->time_base = AVRational{1, kOutputSampleRate};

    if (res.outFmt->oformat->flags & AVFMT_GLOBALHEADER) {
        res.encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    if (avcodec_open2(res.encCtx, pcmCodec, nullptr) < 0) {
        return std::string("Failed to open PCM encoder");
    }
    if (avcodec_parameters_from_context(outStream->codecpar, res.encCtx) < 0) {
        return std::string("Failed to set output stream parameters");
    }

    const bool needsFile = !(res.outFmt->oformat->flags & AVFMT_NOFILE);
    if (needsFile && avio_open(&res.outFmt->pb, outputPath, AVIO_FLAG_WRITE) < 0) {
        return std::string("Failed to open output file for writing");
    }
    if (avformat_write_header(res.outFmt, nullptr) < 0) {
        return std::string("Failed to write output header");
    }

    // --- Working buffers ---
    res.pkt = av_packet_alloc();
    res.outPkt = av_packet_alloc();
    res.frame = av_frame_alloc();
    res.resampled = av_frame_alloc();
    if (!res.pkt || !res.outPkt || !res.frame || !res.resampled) {
        return std::string("Failed to allocate packet/frame buffers");
    }

    int64_t nextPts = 0;  // running count of output samples handed to the encoder

    // Pulls every packet currently available from the encoder and writes it to the muxer.
    // Returns 0 when the encoder has nothing more for now, <0 on an encode/write error.
    auto writeEncodedPackets = [&]() -> int {
        for (;;) {
            int rc = avcodec_receive_packet(res.encCtx, res.outPkt);
            if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
                return 0;
            }
            if (rc < 0) {
                return rc;
            }
            res.outPkt->stream_index = outStream->index;
            av_packet_rescale_ts(res.outPkt, res.encCtx->time_base, outStream->time_base);
            rc = av_interleaved_write_frame(res.outFmt, res.outPkt);
            av_packet_unref(res.outPkt);
            if (rc < 0) {
                return rc;
            }
        }
    };

    // Resamples one decoded frame and feeds the result to the encoder.
    // Passing nullptr drains the samples still buffered inside the resampler.
    // Returns <0 only for output (encode/write) failures; resample problems skip the frame.
    auto resampleAndEncode = [&](const AVFrame* decoded) -> int {
        const int inSamples = decoded ? decoded->nb_samples : 0;
        const int64_t maxOut = av_rescale_rnd(swr_get_delay(res.swr, inRate) + inSamples,
                                              kOutputSampleRate, inRate, AV_ROUND_UP);
        const int capacity = static_cast<int>(maxOut);
        if (capacity <= 0 || capacity != maxOut) {
            return 0;  // nothing to produce (or an implausible size): skip
        }

        // av_frame_unref() resets format/rate/layout, so they are configured for every frame.
        AVFrame* out = res.resampled;
        av_frame_unref(out);
        out->format = AV_SAMPLE_FMT_S16;
        out->sample_rate = kOutputSampleRate;
        out->nb_samples = capacity;
        if (av_channel_layout_copy(&out->ch_layout, &monoLayout) < 0
            || av_frame_get_buffer(out, 0) < 0) {
            av_frame_unref(out);
            LOGW("convertToWav16kMono: failed to allocate resample buffer, skipping frame");
            return 0;
        }

        const uint8_t** inData = decoded ? const_cast<const uint8_t**>(decoded->extended_data) : nullptr;
        const int converted = swr_convert(res.swr, out->data, capacity, inData, inSamples);
        if (converted <= 0) {
            av_frame_unref(out);
            if (converted < 0) {
                LOGW("convertToWav16kMono: swr_convert failed (%d), skipping frame", converted);
            }
            return 0;
        }

        // The buffer was sized for the worst case; expose only what was actually produced.
        out->nb_samples = converted;
        out->pts = nextPts;
        nextPts += converted;

        const int sendRc = avcodec_send_frame(res.encCtx, out);
        av_frame_unref(out);
        if (sendRc < 0) {
            return sendRc;
        }
        return writeEncodedPackets();
    };

    // Receives every frame the decoder currently has ready and pushes it through the pipeline.
    auto drainDecoder = [&]() -> int {
        while (avcodec_receive_frame(res.decCtx, res.frame) == 0) {
            const int rc = resampleAndEncode(res.frame);
            av_frame_unref(res.frame);
            if (rc < 0) {
                return rc;
            }
        }
        return 0;
    };

    // --- Main loop ---
    while (av_read_frame(res.inFmt, res.pkt) >= 0) {
        int rc = 0;
        if (res.pkt->stream_index == audioStreamIndex) {
            if (avcodec_send_packet(res.decCtx, res.pkt) == 0) {
                rc = drainDecoder();
            } else {
                LOGW("convertToWav16kMono: decoder rejected a packet, skipping it");
            }
        }
        av_packet_unref(res.pkt);
        if (rc < 0) {
            return std::string("Failed to write output audio data");
        }
    }

    // --- Flush: decoder, then resampler, then encoder, so no trailing samples are lost ---
    avcodec_send_packet(res.decCtx, nullptr);
    if (drainDecoder() < 0 || resampleAndEncode(nullptr) < 0) {
        return std::string("Failed to write output audio data");
    }
    avcodec_send_frame(res.encCtx, nullptr);
    if (writeEncodedPackets() < 0) {
        return std::string("Failed to write output audio data");
    }

    // The trailer lets the muxer finalize the file; a failure here means a broken output.
    if (av_write_trailer(res.outFmt) < 0) {
        return std::string("Failed to write output trailer");
    }
    if (needsFile && avio_closep(&res.outFmt->pb) < 0) {
        return std::string("Failed to finalize output file");
    }

    return std::string("");
#else
    (void)inputPath;
    (void)outputPath;
    return "FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.";
#endif
}
|
|
305
|
+
|
|
306
|
+
// Generic conversion: supports writing WAV/MP3/FLAC depending on output file extension and linked encoders.
|
|
307
|
+
// WAV path always uses convertToWav16kMono (16 kHz mono out for sherpa-onnx). outputSampleRateHz is only used for MP3 (libshine: 32000/44100/48000); 0 = default 44100.
|
|
308
|
+
static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz) {
|
|
309
|
+
#ifdef HAVE_FFMPEG
|
|
310
|
+
// WAV output is always 16 kHz mono via convertToWav16kMono (sherpa-onnx). Input WAV at 16k is resampled 16k->16k (no change).
|
|
311
|
+
std::string fmt(formatHint ? formatHint : "");
|
|
312
|
+
if (fmt == "wav" || fmt == "wav16k") {
|
|
313
|
+
return convertToWav16kMono(inputPath, outputPath);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Try to determine codec id from format hint
|
|
317
|
+
AVCodecID codec_id = AV_CODEC_ID_NONE;
|
|
318
|
+
if (fmt == "mp3") codec_id = AV_CODEC_ID_MP3;
|
|
319
|
+
else if (fmt == "flac") codec_id = AV_CODEC_ID_FLAC;
|
|
320
|
+
else {
|
|
321
|
+
// fallback to WAV
|
|
322
|
+
return convertToWav16kMono(inputPath, outputPath);
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
// The implementation for generic encoding uses the same decode+resample pipeline
|
|
326
|
+
// but selects encoder by codec_id and creates an output container based on file extension.
|
|
327
|
+
// For brevity we reuse much of the WAV path but change encoder selection.
|
|
328
|
+
|
|
329
|
+
// Open input
|
|
330
|
+
AVFormatContext* inFmt = nullptr;
|
|
331
|
+
if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
|
|
332
|
+
return std::string("Failed to open input file");
|
|
333
|
+
}
|
|
334
|
+
if (avformat_find_stream_info(inFmt, nullptr) < 0) {
|
|
335
|
+
avformat_close_input(&inFmt);
|
|
336
|
+
return std::string("Failed to find stream info");
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
int audioStreamIndex = -1;
|
|
340
|
+
for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
|
|
341
|
+
if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
|
|
342
|
+
audioStreamIndex = i;
|
|
343
|
+
break;
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
if (audioStreamIndex < 0) {
|
|
347
|
+
avformat_close_input(&inFmt);
|
|
348
|
+
return std::string("No audio stream found in input");
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
AVStream* inStream = inFmt->streams[audioStreamIndex];
|
|
352
|
+
const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
|
|
353
|
+
if (!decoder) {
|
|
354
|
+
avformat_close_input(&inFmt);
|
|
355
|
+
return std::string("Unsupported input codec");
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
AVCodecContext* decCtx = avcodec_alloc_context3(decoder);
|
|
359
|
+
if (!decCtx) {
|
|
360
|
+
avformat_close_input(&inFmt);
|
|
361
|
+
return std::string("Failed to allocate decoder context");
|
|
362
|
+
}
|
|
363
|
+
if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
|
|
364
|
+
avcodec_free_context(&decCtx);
|
|
365
|
+
avformat_close_input(&inFmt);
|
|
366
|
+
return std::string("Failed to copy codec parameters");
|
|
367
|
+
}
|
|
368
|
+
if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
|
|
369
|
+
avcodec_free_context(&decCtx);
|
|
370
|
+
avformat_close_input(&inFmt);
|
|
371
|
+
return std::string("Failed to open decoder");
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
// We'll configure resampler later based on encoder requirements.
|
|
375
|
+
SwrContext* swr = nullptr;
|
|
376
|
+
|
|
377
|
+
AVFormatContext* outFmt = nullptr;
|
|
378
|
+
if (avformat_alloc_output_context2(&outFmt, nullptr, nullptr, outputPath) < 0 || !outFmt) {
|
|
379
|
+
swr_free(&swr);
|
|
380
|
+
avcodec_free_context(&decCtx);
|
|
381
|
+
avformat_close_input(&inFmt);
|
|
382
|
+
return std::string("Failed to allocate output context");
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
const AVCodec* encoder = nullptr;
|
|
386
|
+
if (codec_id == AV_CODEC_ID_MP3) {
|
|
387
|
+
// Force using libshine for MP3 encoding. Do NOT fall back to libmp3lame or
|
|
388
|
+
// internal ffmpeg MP3 encoder to respect licensing choice.
|
|
389
|
+
encoder = avcodec_find_encoder_by_name("libshine");
|
|
390
|
+
if (!encoder) {
|
|
391
|
+
avformat_free_context(outFmt);
|
|
392
|
+
swr_free(&swr);
|
|
393
|
+
avcodec_free_context(&decCtx);
|
|
394
|
+
avformat_close_input(&inFmt);
|
|
395
|
+
return std::string("libshine encoder not available in this build");
|
|
396
|
+
}
|
|
397
|
+
} else {
|
|
398
|
+
encoder = avcodec_find_encoder(codec_id);
|
|
399
|
+
if (!encoder) {
|
|
400
|
+
avformat_free_context(outFmt);
|
|
401
|
+
swr_free(&swr);
|
|
402
|
+
avcodec_free_context(&decCtx);
|
|
403
|
+
avformat_close_input(&inFmt);
|
|
404
|
+
return std::string("Requested encoder not available in this build");
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
AVStream* outStream = avformat_new_stream(outFmt, nullptr);
|
|
409
|
+
if (!outStream) {
|
|
410
|
+
avformat_free_context(outFmt);
|
|
411
|
+
swr_free(&swr);
|
|
412
|
+
avcodec_free_context(&decCtx);
|
|
413
|
+
avformat_close_input(&inFmt);
|
|
414
|
+
return std::string("Failed to create output stream");
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
AVCodecContext* encCtx = avcodec_alloc_context3(encoder);
|
|
418
|
+
// Preserve input sample rate / channel layout by default
|
|
419
|
+
if (!encCtx) {
|
|
420
|
+
avformat_free_context(outFmt);
|
|
421
|
+
swr_free(&swr);
|
|
422
|
+
avcodec_free_context(&decCtx);
|
|
423
|
+
avformat_close_input(&inFmt);
|
|
424
|
+
return std::string("Failed to allocate encoder context");
|
|
425
|
+
}
|
|
426
|
+
// Set channel layout: prefer input stream layout, otherwise decoder layout.
|
|
427
|
+
if (inStream->codecpar->ch_layout.nb_channels) {
|
|
428
|
+
if (av_channel_layout_copy(&encCtx->ch_layout, &inStream->codecpar->ch_layout) < 0) {
|
|
429
|
+
avcodec_free_context(&encCtx);
|
|
430
|
+
avformat_free_context(outFmt);
|
|
431
|
+
swr_free(&swr);
|
|
432
|
+
avcodec_free_context(&decCtx);
|
|
433
|
+
avformat_close_input(&inFmt);
|
|
434
|
+
return std::string("Failed to copy input channel layout to encoder");
|
|
435
|
+
}
|
|
436
|
+
} else {
|
|
437
|
+
if (av_channel_layout_copy(&encCtx->ch_layout, &decCtx->ch_layout) < 0) {
|
|
438
|
+
avcodec_free_context(&encCtx);
|
|
439
|
+
avformat_free_context(outFmt);
|
|
440
|
+
swr_free(&swr);
|
|
441
|
+
avcodec_free_context(&decCtx);
|
|
442
|
+
avformat_close_input(&inFmt);
|
|
443
|
+
return std::string("Failed to set encoder channel layout");
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
// If using libshine (MP3), ensure channel_layout is explicitly set (old encoders expect it)
|
|
448
|
+
if (codec_id == AV_CODEC_ID_MP3) {
|
|
449
|
+
// If encCtx->ch_layout appears empty, set default based on input stream channels
|
|
450
|
+
if (encCtx->ch_layout.nb_channels <= 0) {
|
|
451
|
+
int nb_channels = 1;
|
|
452
|
+
if (inStream->codecpar && inStream->codecpar->ch_layout.nb_channels > 0) {
|
|
453
|
+
nb_channels = inStream->codecpar->ch_layout.nb_channels;
|
|
454
|
+
} else if (decCtx && decCtx->ch_layout.nb_channels > 0) {
|
|
455
|
+
nb_channels = decCtx->ch_layout.nb_channels;
|
|
456
|
+
}
|
|
457
|
+
av_channel_layout_default(&encCtx->ch_layout, nb_channels);
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// Set sample rate from input/decoder if not already set
|
|
462
|
+
encCtx->sample_rate = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
|
|
463
|
+
|
|
464
|
+
// Probe encoder-supported configurations (sample formats, sample rates, channel layouts)
|
|
465
|
+
AVSampleFormat chosen_fmt = AV_SAMPLE_FMT_NONE;
|
|
466
|
+
const void *fmt_configs = nullptr;
|
|
467
|
+
int fmt_num = 0;
|
|
468
|
+
avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_SAMPLE_FORMAT, 0, &fmt_configs, &fmt_num);
|
|
469
|
+
|
|
470
|
+
const void *sr_configs = nullptr;
|
|
471
|
+
int sr_num = 0;
|
|
472
|
+
avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_SAMPLE_RATE, 0, &sr_configs, &sr_num);
|
|
473
|
+
|
|
474
|
+
const void *chl_configs = nullptr;
|
|
475
|
+
int chl_num = 0;
|
|
476
|
+
avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_CHANNEL_LAYOUT, 0, &chl_configs, &chl_num);
|
|
477
|
+
|
|
478
|
+
// Log supported sample formats
|
|
479
|
+
if (fmt_configs && fmt_num > 0) {
|
|
480
|
+
const AVSampleFormat *fmts = (const AVSampleFormat *)fmt_configs;
|
|
481
|
+
for (int i = 0; i < fmt_num; ++i) {
|
|
482
|
+
const char *name = av_get_sample_fmt_name(fmts[i]);
|
|
483
|
+
LOGI("encoder supported fmt[%d]=%s", i, name ? name : "?");
|
|
484
|
+
}
|
|
485
|
+
// prefer interleaved S16, then planar S16P, then decoder fmt, then first
|
|
486
|
+
for (int i = 0; i < fmt_num; ++i) if (fmts[i] == AV_SAMPLE_FMT_S16) { chosen_fmt = AV_SAMPLE_FMT_S16; break; }
|
|
487
|
+
if (chosen_fmt == AV_SAMPLE_FMT_NONE && codec_id == AV_CODEC_ID_MP3) {
|
|
488
|
+
for (int i = 0; i < fmt_num; ++i) if (fmts[i] == AV_SAMPLE_FMT_S16P) { chosen_fmt = AV_SAMPLE_FMT_S16P; break; }
|
|
489
|
+
}
|
|
490
|
+
if (chosen_fmt == AV_SAMPLE_FMT_NONE) {
|
|
491
|
+
for (int i = 0; i < fmt_num; ++i) if (fmts[i] == decCtx->sample_fmt) { chosen_fmt = decCtx->sample_fmt; break; }
|
|
492
|
+
}
|
|
493
|
+
if (chosen_fmt == AV_SAMPLE_FMT_NONE && fmt_num > 0) chosen_fmt = fmts[0];
|
|
494
|
+
} else {
|
|
495
|
+
// libshine only supports S16P; default to S16P for MP3 so open succeeds
|
|
496
|
+
chosen_fmt = (codec_id == AV_CODEC_ID_MP3) ? AV_SAMPLE_FMT_S16P : AV_SAMPLE_FMT_S16;
|
|
497
|
+
}
|
|
498
|
+
encCtx->sample_fmt = chosen_fmt;
|
|
499
|
+
|
|
500
|
+
// If supported sample rates are provided, pick one matching our target or fall back
|
|
501
|
+
if (sr_configs && sr_num > 0) {
|
|
502
|
+
const int *srs = (const int*)sr_configs;
|
|
503
|
+
int pick_sr = 0;
|
|
504
|
+
for (int i = 0; i < sr_num; ++i) {
|
|
505
|
+
LOGI("encoder supported sample_rate[%d]=%d", i, srs[i]);
|
|
506
|
+
if (srs[i] == encCtx->sample_rate) { pick_sr = srs[i]; break; }
|
|
507
|
+
}
|
|
508
|
+
if (pick_sr == 0) pick_sr = srs[0];
|
|
509
|
+
encCtx->sample_rate = pick_sr;
|
|
510
|
+
}
|
|
511
|
+
// libshine only supports 32000, 44100, 48000 Hz. Use outputSampleRateHz if valid (32000/44100/48000), else default 44100.
|
|
512
|
+
if (codec_id == AV_CODEC_ID_MP3) {
|
|
513
|
+
int want = (outputSampleRateHz == 32000 || outputSampleRateHz == 44100 || outputSampleRateHz == 48000) ? outputSampleRateHz : 44100;
|
|
514
|
+
if (encCtx->sample_rate != want) {
|
|
515
|
+
LOGI("libshine: setting sample_rate %d (requested %d)", want, outputSampleRateHz);
|
|
516
|
+
encCtx->sample_rate = want;
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
// If supported channel layouts given, prefer matching channels else pick first
|
|
521
|
+
if (chl_configs && chl_num > 0) {
|
|
522
|
+
const AVChannelLayout *layouts = (const AVChannelLayout *)chl_configs;
|
|
523
|
+
int pick_nb = 0;
|
|
524
|
+
for (int i = 0; i < chl_num; ++i) {
|
|
525
|
+
const AVChannelLayout *l = &layouts[i];
|
|
526
|
+
char buf[128];
|
|
527
|
+
av_channel_layout_describe(l, buf, sizeof(buf));
|
|
528
|
+
LOGI("encoder supported ch_layout[%d]=%s nb_channels=%d", i, buf, l->nb_channels);
|
|
529
|
+
if (l->nb_channels == encCtx->ch_layout.nb_channels) { pick_nb = l->nb_channels; break; }
|
|
530
|
+
}
|
|
531
|
+
if (pick_nb == 0) pick_nb = layouts[0].nb_channels > 0 ? layouts[0].nb_channels : 1;
|
|
532
|
+
if (encCtx->ch_layout.nb_channels != pick_nb) av_channel_layout_default(&encCtx->ch_layout, pick_nb);
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
// libshine reads only AVCodecContext (not options). Use a well-known channel layout so nb_channels is always valid.
|
|
536
|
+
if (codec_id == AV_CODEC_ID_MP3) {
|
|
537
|
+
int want_ch = (encCtx->ch_layout.nb_channels == 2) ? 2 : 1;
|
|
538
|
+
av_channel_layout_uninit(&encCtx->ch_layout);
|
|
539
|
+
if (want_ch == 2) {
|
|
540
|
+
AVChannelLayout stereo = AV_CHANNEL_LAYOUT_STEREO;
|
|
541
|
+
if (av_channel_layout_copy(&encCtx->ch_layout, &stereo) < 0)
|
|
542
|
+
av_channel_layout_default(&encCtx->ch_layout, 2);
|
|
543
|
+
} else {
|
|
544
|
+
AVChannelLayout mono = AV_CHANNEL_LAYOUT_MONO;
|
|
545
|
+
if (av_channel_layout_copy(&encCtx->ch_layout, &mono) < 0)
|
|
546
|
+
av_channel_layout_default(&encCtx->ch_layout, 1);
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
// Set a sensible default bitrate for compressed codecs
|
|
551
|
+
if (codec_id == AV_CODEC_ID_MP3 || codec_id == AV_CODEC_ID_AAC) encCtx->bit_rate = 128000;
|
|
552
|
+
else encCtx->bit_rate = 0; // lossless or PCM may ignore
|
|
553
|
+
|
|
554
|
+
if (outFmt->oformat->flags & AVFMT_GLOBALHEADER) encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
|
|
555
|
+
|
|
556
|
+
// Ensure sensible timebase and try opening encoder with options. If it fails, iterate supported sample formats and retry.
|
|
557
|
+
if (encCtx->sample_rate > 0) encCtx->time_base = AVRational{1, encCtx->sample_rate};
|
|
558
|
+
|
|
559
|
+
AVDictionary *enc_opts = nullptr;
|
|
560
|
+
int nb_ch = encCtx->ch_layout.nb_channels;
|
|
561
|
+
if (nb_ch <= 0) nb_ch = 1;
|
|
562
|
+
char tmpbuf[64];
|
|
563
|
+
// For libshine, do not pass options — it uses only AVCodecContext; options can cause "Invalid argument".
|
|
564
|
+
if (codec_id != AV_CODEC_ID_MP3) {
|
|
565
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", nb_ch);
|
|
566
|
+
av_dict_set(&enc_opts, "channels", tmpbuf, 0);
|
|
567
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->sample_rate);
|
|
568
|
+
av_dict_set(&enc_opts, "sample_rate", tmpbuf, 0);
|
|
569
|
+
if (encCtx->bit_rate > 0) {
|
|
570
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", (int)encCtx->bit_rate);
|
|
571
|
+
av_dict_set(&enc_opts, "bit_rate", tmpbuf, 0);
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
int ret = avcodec_open2(encCtx, encoder, &enc_opts);
|
|
576
|
+
if (ret < 0) {
|
|
577
|
+
char errbuf[256];
|
|
578
|
+
av_strerror(ret, errbuf, sizeof(errbuf));
|
|
579
|
+
if (enc_opts) { av_dict_free(&enc_opts); enc_opts = nullptr; }
|
|
580
|
+
|
|
581
|
+
// libshine (MP3): we already set S16P, valid rate, mono/stereo; no useful fallback.
|
|
582
|
+
if (codec_id == AV_CODEC_ID_MP3) {
|
|
583
|
+
std::string msg = std::string("Failed to open encoder: ") + errbuf;
|
|
584
|
+
avcodec_free_context(&encCtx);
|
|
585
|
+
avformat_free_context(outFmt);
|
|
586
|
+
swr_free(&swr);
|
|
587
|
+
avcodec_free_context(&decCtx);
|
|
588
|
+
avformat_close_input(&inFmt);
|
|
589
|
+
return msg;
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
LOGW("avcodec_open2 failed for encoder %s: %s. Trying alternatives.", encoder->name, errbuf);
|
|
593
|
+
|
|
594
|
+
// Try each supported sample format (for non-MP3 encoders that may accept multiple formats)
|
|
595
|
+
const AVSampleFormat *fmts = fmt_configs ? (const AVSampleFormat*)fmt_configs : nullptr;
|
|
596
|
+
if (fmts && fmt_num > 0) {
|
|
597
|
+
for (int i = 0; i < fmt_num && ret < 0; ++i) {
|
|
598
|
+
encCtx->sample_fmt = fmts[i];
|
|
599
|
+
AVDictionary *try_opts = nullptr;
|
|
600
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->ch_layout.nb_channels > 0 ? encCtx->ch_layout.nb_channels : 1);
|
|
601
|
+
av_dict_set(&try_opts, "channels", tmpbuf, 0);
|
|
602
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->sample_rate);
|
|
603
|
+
av_dict_set(&try_opts, "sample_rate", tmpbuf, 0);
|
|
604
|
+
if (encCtx->bit_rate > 0) { snprintf(tmpbuf, sizeof(tmpbuf), "%d", (int)encCtx->bit_rate); av_dict_set(&try_opts, "bit_rate", tmpbuf, 0); }
|
|
605
|
+
const char *sfname = av_get_sample_fmt_name(encCtx->sample_fmt);
|
|
606
|
+
if (sfname) av_dict_set(&try_opts, "sample_fmt", sfname, 0);
|
|
607
|
+
int r = avcodec_open2(encCtx, encoder, &try_opts);
|
|
608
|
+
if (r >= 0) {
|
|
609
|
+
if (try_opts) av_dict_free(&try_opts);
|
|
610
|
+
ret = r;
|
|
611
|
+
break;
|
|
612
|
+
}
|
|
613
|
+
if (try_opts) av_dict_free(&try_opts);
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
// Last resort: try S16 then S16P (for FLAC etc.)
|
|
618
|
+
if (ret < 0) {
|
|
619
|
+
AVSampleFormat fallbacks[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P };
|
|
620
|
+
for (int fi = 0; fi < 2 && ret < 0; ++fi) {
|
|
621
|
+
encCtx->sample_fmt = fallbacks[fi];
|
|
622
|
+
AVDictionary *try_opts = nullptr;
|
|
623
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->ch_layout.nb_channels > 0 ? encCtx->ch_layout.nb_channels : 1);
|
|
624
|
+
av_dict_set(&try_opts, "channels", tmpbuf, 0);
|
|
625
|
+
snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->sample_rate);
|
|
626
|
+
av_dict_set(&try_opts, "sample_rate", tmpbuf, 0);
|
|
627
|
+
if (encCtx->bit_rate > 0) { snprintf(tmpbuf, sizeof(tmpbuf), "%d", (int)encCtx->bit_rate); av_dict_set(&try_opts, "bit_rate", tmpbuf, 0); }
|
|
628
|
+
const char *sfname = av_get_sample_fmt_name(encCtx->sample_fmt);
|
|
629
|
+
if (sfname) av_dict_set(&try_opts, "sample_fmt", sfname, 0);
|
|
630
|
+
int r = avcodec_open2(encCtx, encoder, &try_opts);
|
|
631
|
+
if (r >= 0) {
|
|
632
|
+
if (try_opts) av_dict_free(&try_opts);
|
|
633
|
+
ret = r;
|
|
634
|
+
break;
|
|
635
|
+
}
|
|
636
|
+
if (try_opts) av_dict_free(&try_opts);
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
if (ret < 0) {
|
|
641
|
+
char eb[256]; av_strerror(ret, eb, sizeof(eb));
|
|
642
|
+
std::string msg = std::string("Failed to open encoder: ") + eb;
|
|
643
|
+
avcodec_free_context(&encCtx);
|
|
644
|
+
avformat_free_context(outFmt);
|
|
645
|
+
swr_free(&swr);
|
|
646
|
+
avcodec_free_context(&decCtx);
|
|
647
|
+
avformat_close_input(&inFmt);
|
|
648
|
+
return msg;
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
if (avcodec_parameters_from_context(outStream->codecpar, encCtx) < 0) {
|
|
653
|
+
avcodec_free_context(&encCtx);
|
|
654
|
+
avformat_free_context(outFmt);
|
|
655
|
+
swr_free(&swr);
|
|
656
|
+
avcodec_free_context(&decCtx);
|
|
657
|
+
avformat_close_input(&inFmt);
|
|
658
|
+
return std::string("Failed to set output stream parameters");
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
if (!(outFmt->oformat->flags & AVFMT_NOFILE)) {
|
|
662
|
+
if (avio_open(&outFmt->pb, outputPath, AVIO_FLAG_WRITE) < 0) {
|
|
663
|
+
avcodec_free_context(&encCtx);
|
|
664
|
+
avformat_free_context(outFmt);
|
|
665
|
+
swr_free(&swr);
|
|
666
|
+
avcodec_free_context(&decCtx);
|
|
667
|
+
avformat_close_input(&inFmt);
|
|
668
|
+
return std::string("Failed to open output file for writing");
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
if (avformat_write_header(outFmt, nullptr) < 0) {
|
|
673
|
+
if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
|
|
674
|
+
avcodec_free_context(&encCtx);
|
|
675
|
+
avformat_free_context(outFmt);
|
|
676
|
+
swr_free(&swr);
|
|
677
|
+
avcodec_free_context(&decCtx);
|
|
678
|
+
avformat_close_input(&inFmt);
|
|
679
|
+
return std::string("Failed to write output header");
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
AVPacket* pkt = av_packet_alloc();
|
|
683
|
+
AVFrame* frame = av_frame_alloc();
|
|
684
|
+
AVFrame* resampled = av_frame_alloc();
|
|
685
|
+
// Match encoder format/rate
|
|
686
|
+
resampled->format = encCtx->sample_fmt;
|
|
687
|
+
resampled->sample_rate = encCtx->sample_rate;
|
|
688
|
+
// ensure resampled frame has encoder channel layout
|
|
689
|
+
if (av_channel_layout_copy(&resampled->ch_layout, &encCtx->ch_layout) < 0) {
|
|
690
|
+
av_frame_free(&frame);
|
|
691
|
+
av_frame_free(&resampled);
|
|
692
|
+
av_packet_free(&pkt);
|
|
693
|
+
avcodec_free_context(&encCtx);
|
|
694
|
+
avformat_free_context(outFmt);
|
|
695
|
+
avcodec_free_context(&decCtx);
|
|
696
|
+
avformat_close_input(&inFmt);
|
|
697
|
+
return std::string("Failed to set resampled channel layout");
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
// Initialize resampler to convert from decoder format -> chosen encoder format
|
|
701
|
+
AVChannelLayout in_ch_layout2{};
|
|
702
|
+
if (inStream->codecpar->ch_layout.nb_channels) {
|
|
703
|
+
if (av_channel_layout_copy(&in_ch_layout2, &inStream->codecpar->ch_layout) < 0) {
|
|
704
|
+
av_channel_layout_uninit(&resampled->ch_layout);
|
|
705
|
+
av_frame_free(&frame);
|
|
706
|
+
av_frame_free(&resampled);
|
|
707
|
+
av_packet_free(&pkt);
|
|
708
|
+
avcodec_free_context(&encCtx);
|
|
709
|
+
avformat_free_context(outFmt);
|
|
710
|
+
swr_free(&swr);
|
|
711
|
+
avcodec_free_context(&decCtx);
|
|
712
|
+
avformat_close_input(&inFmt);
|
|
713
|
+
return std::string("Failed to copy input channel layout");
|
|
714
|
+
}
|
|
715
|
+
} else {
|
|
716
|
+
if (av_channel_layout_copy(&in_ch_layout2, &decCtx->ch_layout) < 0) {
|
|
717
|
+
av_channel_layout_uninit(&resampled->ch_layout);
|
|
718
|
+
av_frame_free(&frame);
|
|
719
|
+
av_frame_free(&resampled);
|
|
720
|
+
av_packet_free(&pkt);
|
|
721
|
+
avcodec_free_context(&encCtx);
|
|
722
|
+
avformat_free_context(outFmt);
|
|
723
|
+
swr_free(&swr);
|
|
724
|
+
avcodec_free_context(&decCtx);
|
|
725
|
+
avformat_close_input(&inFmt);
|
|
726
|
+
return std::string("Failed to init input channel layout");
|
|
727
|
+
}
|
|
728
|
+
}
|
|
729
|
+
if (swr_alloc_set_opts2(&swr,
|
|
730
|
+
&encCtx->ch_layout, encCtx->sample_fmt, encCtx->sample_rate,
|
|
731
|
+
&in_ch_layout2, (AVSampleFormat)decCtx->sample_fmt, decCtx->sample_rate,
|
|
732
|
+
0, nullptr) < 0 || !swr) {
|
|
733
|
+
av_channel_layout_uninit(&in_ch_layout2);
|
|
734
|
+
if (swr) swr_free(&swr);
|
|
735
|
+
av_channel_layout_uninit(&resampled->ch_layout);
|
|
736
|
+
av_frame_free(&frame);
|
|
737
|
+
av_frame_free(&resampled);
|
|
738
|
+
av_packet_free(&pkt);
|
|
739
|
+
avcodec_free_context(&encCtx);
|
|
740
|
+
avformat_free_context(outFmt);
|
|
741
|
+
avcodec_free_context(&decCtx);
|
|
742
|
+
avformat_close_input(&inFmt);
|
|
743
|
+
return std::string("Failed to initialize resampler");
|
|
744
|
+
}
|
|
745
|
+
av_channel_layout_uninit(&in_ch_layout2);
|
|
746
|
+
|
|
747
|
+
while (av_read_frame(inFmt, pkt) >= 0) {
|
|
748
|
+
if (pkt->stream_index == audioStreamIndex) {
|
|
749
|
+
if (avcodec_send_packet(decCtx, pkt) == 0) {
|
|
750
|
+
while (avcodec_receive_frame(decCtx, frame) == 0) {
|
|
751
|
+
int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
|
|
752
|
+
int64_t out_nb_samples = av_rescale_rnd(swr_get_delay(swr, in_sr2) + frame->nb_samples, encCtx->sample_rate, in_sr2, AV_ROUND_UP);
|
|
753
|
+
uint8_t** outData = nullptr;
|
|
754
|
+
int out_ch2 = encCtx->ch_layout.nb_channels;
|
|
755
|
+
if (out_ch2 <= 0) out_ch2 = 1;
|
|
756
|
+
if (av_samples_alloc_array_and_samples(&outData, nullptr, out_ch2, (int)out_nb_samples, encCtx->sample_fmt, 0) < 0) {
|
|
757
|
+
av_packet_unref(pkt);
|
|
758
|
+
continue;
|
|
759
|
+
}
|
|
760
|
+
int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
|
|
761
|
+
if (converted < 0) {
|
|
762
|
+
av_freep(&outData[0]);
|
|
763
|
+
av_freep(&outData);
|
|
764
|
+
continue;
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
resampled->nb_samples = converted;
|
|
768
|
+
if (av_frame_get_buffer(resampled, 0) < 0) {
|
|
769
|
+
av_freep(&outData[0]);
|
|
770
|
+
av_freep(&outData);
|
|
771
|
+
continue;
|
|
772
|
+
}
|
|
773
|
+
int bytes_per_sample = av_get_bytes_per_sample((AVSampleFormat)resampled->format);
|
|
774
|
+
int copy_size2 = converted * bytes_per_sample * out_ch2;
|
|
775
|
+
memcpy(resampled->data[0], outData[0], copy_size2);
|
|
776
|
+
|
|
777
|
+
if (avcodec_send_frame(encCtx, resampled) == 0) {
|
|
778
|
+
AVPacket* outPkt = av_packet_alloc();
|
|
779
|
+
while (avcodec_receive_packet(encCtx, outPkt) == 0) {
|
|
780
|
+
outPkt->stream_index = outStream->index;
|
|
781
|
+
av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
|
|
782
|
+
av_interleaved_write_frame(outFmt, outPkt);
|
|
783
|
+
av_packet_unref(outPkt);
|
|
784
|
+
}
|
|
785
|
+
av_packet_free(&outPkt);
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
av_freep(&outData[0]);
|
|
789
|
+
av_freep(&outData);
|
|
790
|
+
av_frame_unref(resampled);
|
|
791
|
+
av_frame_unref(frame);
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
av_packet_unref(pkt);
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
// Flush encoder
|
|
799
|
+
avcodec_send_frame(encCtx, nullptr);
|
|
800
|
+
AVPacket* outPkt2 = av_packet_alloc();
|
|
801
|
+
while (avcodec_receive_packet(encCtx, outPkt2) == 0) {
|
|
802
|
+
outPkt2->stream_index = outStream->index;
|
|
803
|
+
av_packet_rescale_ts(outPkt2, encCtx->time_base, outStream->time_base);
|
|
804
|
+
av_interleaved_write_frame(outFmt, outPkt2);
|
|
805
|
+
av_packet_unref(outPkt2);
|
|
806
|
+
}
|
|
807
|
+
av_packet_free(&outPkt2);
|
|
808
|
+
|
|
809
|
+
av_write_trailer(outFmt);
|
|
810
|
+
if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
|
|
811
|
+
|
|
812
|
+
av_packet_free(&pkt);
|
|
813
|
+
av_frame_free(&frame);
|
|
814
|
+
av_channel_layout_uninit(&resampled->ch_layout);
|
|
815
|
+
av_frame_free(&resampled);
|
|
816
|
+
|
|
817
|
+
swr_free(&swr);
|
|
818
|
+
avcodec_free_context(&encCtx);
|
|
819
|
+
avformat_free_context(outFmt);
|
|
820
|
+
avcodec_free_context(&decCtx);
|
|
821
|
+
avformat_close_input(&inFmt);
|
|
822
|
+
|
|
823
|
+
return std::string("");
|
|
824
|
+
#else
|
|
825
|
+
(void)inputPath; (void)outputPath; (void)formatHint;
|
|
826
|
+
return std::string("FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.");
|
|
827
|
+
#endif
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
extern "C" {
|
|
831
|
+
|
|
832
|
+
// Called from Kotlin: SherpaOnnxModule.nativeConvertAudioToWav16k(inputPath, outputPath) -> Boolean
|
|
833
|
+
// or from a dedicated helper that returns an error string. We use a single JNI that returns a boolean
|
|
834
|
+
// and optionally pass back an error message via a separate call or out parameter.
|
|
835
|
+
// For simplicity we expose one method that returns a jstring: empty = success, non-empty = error message.
|
|
836
|
+
JNIEXPORT jstring JNICALL
|
|
837
|
+
Java_com_sherpaonnx_SherpaOnnxModule_nativeConvertAudioToWav16k(
|
|
838
|
+
JNIEnv* env,
|
|
839
|
+
jobject /* this */,
|
|
840
|
+
jstring inputPath,
|
|
841
|
+
jstring outputPath) {
|
|
842
|
+
if (inputPath == nullptr || outputPath == nullptr) {
|
|
843
|
+
return env->NewStringUTF("inputPath and outputPath must be non-null");
|
|
844
|
+
}
|
|
845
|
+
const char* input = env->GetStringUTFChars(inputPath, nullptr);
|
|
846
|
+
const char* output = env->GetStringUTFChars(outputPath, nullptr);
|
|
847
|
+
if (input == nullptr || output == nullptr) {
|
|
848
|
+
if (input) env->ReleaseStringUTFChars(inputPath, input);
|
|
849
|
+
if (output) env->ReleaseStringUTFChars(outputPath, output);
|
|
850
|
+
return env->NewStringUTF("Failed to get path strings");
|
|
851
|
+
}
|
|
852
|
+
std::string err = convertToWav16kMono(input, output);
|
|
853
|
+
env->ReleaseStringUTFChars(inputPath, input);
|
|
854
|
+
env->ReleaseStringUTFChars(outputPath, output);
|
|
855
|
+
return env->NewStringUTF(err.c_str());
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
// JNI entry point for SherpaOnnxModule.nativeConvertAudioToFormat(inputPath, outputPath,
// formatHint, outputSampleRateHz).
//
// Forwards the arguments to convertToFormat() and reports the outcome as a Java string:
//   - empty string     -> success
//   - non-empty string -> human-readable error message
//   - nullptr          -> a Java exception is already pending (it is rethrown to the caller
//                         as soon as this native method returns)
//
// @param env                 JNI environment of the calling thread.
// @param inputPath           Path of the audio file to read; must be non-null.
// @param outputPath          Path of the file to write; must be non-null.
// @param formatHint          Target format hint passed through to convertToFormat(); must be non-null.
// @param outputSampleRateHz  Requested output sample rate, passed through unchanged.
JNIEXPORT jstring JNICALL
Java_com_sherpaonnx_SherpaOnnxModule_nativeConvertAudioToFormat(
    JNIEnv* env,
    jobject /* this */,
    jstring inputPath,
    jstring outputPath,
    jstring formatHint,
    jint outputSampleRateHz) {
  if (inputPath == nullptr || outputPath == nullptr || formatHint == nullptr) {
    return env->NewStringUTF("inputPath, outputPath and formatHint must be non-null");
  }

  // Acquire the strings one after another and stop at the first failure: most JNI functions
  // (GetStringUTFChars included) must not be called while an exception is pending.
  // A non-null `fmt` therefore implies that `input` and `output` are non-null as well.
  const char* input = env->GetStringUTFChars(inputPath, nullptr);
  const char* output = (input != nullptr) ? env->GetStringUTFChars(outputPath, nullptr) : nullptr;
  const char* fmt = (output != nullptr) ? env->GetStringUTFChars(formatHint, nullptr) : nullptr;
  if (fmt == nullptr) {
    // ReleaseStringUTFChars is one of the few calls that is safe with a pending exception.
    if (output != nullptr) env->ReleaseStringUTFChars(outputPath, output);
    if (input != nullptr) env->ReleaseStringUTFChars(inputPath, input);
    // NewStringUTF is not safe in that state; let the pending exception propagate instead.
    if (env->ExceptionCheck()) return nullptr;
    return env->NewStringUTF("Failed to get path/format strings");
  }

  const std::string err = convertToFormat(input, output, fmt, static_cast<int>(outputSampleRateHz));

  env->ReleaseStringUTFChars(inputPath, input);
  env->ReleaseStringUTFChars(outputPath, output);
  env->ReleaseStringUTFChars(formatHint, fmt);

  return env->NewStringUTF(err.c_str());
}
|
|
887
|
+
|
|
888
|
+
} // extern "C"
|