react-native-sherpa-onnx 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. package/README.md +232 -236
  2. package/SherpaOnnx.podspec +68 -64
  3. package/android/build.gradle +182 -192
  4. package/android/codegen.gradle +57 -0
  5. package/android/prebuilt-download.gradle +428 -0
  6. package/android/prebuilt-versions.gradle +43 -0
  7. package/android/proguard-rules.pro +10 -0
  8. package/android/src/main/assets/testModels/add_mul_add.onnx +28 -0
  9. package/android/src/main/assets/testModels/nnapi_internal_uint8_support.onnx +0 -0
  10. package/android/src/main/assets/testModels/qnn_multi_ctx_embed.onnx +0 -0
  11. package/android/src/main/cpp/CMakeLists.txt +166 -129
  12. package/android/src/main/cpp/CMakePresets.json +54 -0
  13. package/android/src/main/cpp/crypto/sha256.cpp +174 -0
  14. package/android/src/main/cpp/crypto/sha256.h +16 -0
  15. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +404 -0
  16. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +56 -0
  17. package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +181 -0
  18. package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +888 -0
  19. package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-common.h +18 -18
  20. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +86 -0
  21. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +20 -0
  22. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +423 -0
  23. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +55 -0
  24. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +399 -0
  25. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +238 -0
  26. package/{ios → android/src/main/cpp/jni/model_detect}/sherpa-onnx-model-detect.h +122 -89
  27. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +99 -0
  28. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.h +16 -0
  29. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +78 -0
  30. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.h +16 -0
  31. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +190 -0
  32. package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +301 -0
  33. package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +94 -0
  34. package/android/src/main/java/com/sherpaonnx/{SherpaOnnxCoreHelper.kt → SherpaOnnxAssetHelper.kt} +350 -236
  35. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +791 -483
  36. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +699 -109
  37. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +1123 -668
  38. package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +187 -0
  39. package/ios/SherpaOnnx+Assets.h +11 -0
  40. package/ios/SherpaOnnx+Assets.mm +325 -0
  41. package/ios/SherpaOnnx+STT.mm +455 -118
  42. package/ios/SherpaOnnx+TTS.mm +1101 -712
  43. package/ios/SherpaOnnx.h +17 -6
  44. package/ios/SherpaOnnx.mm +206 -311
  45. package/ios/SherpaOnnx.xcconfig +19 -19
  46. package/ios/SherpaOnnxCoreMLHelper.swift +24 -0
  47. package/ios/archive/sherpa-onnx-archive-helper.h +21 -0
  48. package/ios/archive/sherpa-onnx-archive-helper.mm +296 -0
  49. package/ios/libarchive_darwin_config.h +153 -0
  50. package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-common.h +18 -18
  51. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +49 -0
  52. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +210 -0
  53. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +344 -0
  54. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +201 -0
  55. package/{android/src/main/cpp/jni → ios/model_detect}/sherpa-onnx-model-detect.h +117 -89
  56. package/ios/scripts/patch-libarchive-includes.sh +61 -0
  57. package/ios/scripts/setup-ios-libarchive.sh +98 -0
  58. package/ios/stt/sherpa-onnx-stt-wrapper.h +129 -0
  59. package/ios/stt/sherpa-onnx-stt-wrapper.mm +523 -0
  60. package/ios/{sherpa-onnx-tts-wrapper.h → tts/sherpa-onnx-tts-wrapper.h} +90 -85
  61. package/ios/{sherpa-onnx-tts-wrapper.mm → tts/sherpa-onnx-tts-wrapper.mm} +376 -345
  62. package/lib/module/NativeSherpaOnnx.js +3 -0
  63. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  64. package/lib/module/audio/index.js +22 -0
  65. package/lib/module/audio/index.js.map +1 -0
  66. package/lib/module/diarization/index.js +1 -1
  67. package/lib/module/diarization/index.js.map +1 -1
  68. package/lib/module/download/ModelDownloadManager.js +918 -0
  69. package/lib/module/download/ModelDownloadManager.js.map +1 -0
  70. package/lib/module/download/extractTarBz2.js +53 -0
  71. package/lib/module/download/extractTarBz2.js.map +1 -0
  72. package/lib/module/download/index.js +6 -0
  73. package/lib/module/download/index.js.map +1 -0
  74. package/lib/module/download/validation.js +178 -0
  75. package/lib/module/download/validation.js.map +1 -0
  76. package/lib/module/enhancement/index.js +1 -1
  77. package/lib/module/enhancement/index.js.map +1 -1
  78. package/lib/module/index.js +41 -3
  79. package/lib/module/index.js.map +1 -1
  80. package/lib/module/separation/index.js +1 -1
  81. package/lib/module/separation/index.js.map +1 -1
  82. package/lib/module/stt/index.js +127 -60
  83. package/lib/module/stt/index.js.map +1 -1
  84. package/lib/module/stt/sttModelLanguages.js +512 -0
  85. package/lib/module/stt/sttModelLanguages.js.map +1 -0
  86. package/lib/module/stt/types.js +53 -1
  87. package/lib/module/stt/types.js.map +1 -1
  88. package/lib/module/tts/index.js +216 -289
  89. package/lib/module/tts/index.js.map +1 -1
  90. package/lib/module/tts/types.js +86 -1
  91. package/lib/module/tts/types.js.map +1 -1
  92. package/lib/module/types.js.map +1 -1
  93. package/lib/module/utils.js +86 -73
  94. package/lib/module/utils.js.map +1 -1
  95. package/lib/module/vad/index.js +1 -1
  96. package/lib/module/vad/index.js.map +1 -1
  97. package/lib/typescript/src/NativeSherpaOnnx.d.ts +192 -38
  98. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  99. package/lib/typescript/src/audio/index.d.ts +13 -0
  100. package/lib/typescript/src/audio/index.d.ts.map +1 -0
  101. package/lib/typescript/src/diarization/index.d.ts +3 -2
  102. package/lib/typescript/src/diarization/index.d.ts.map +1 -1
  103. package/lib/typescript/src/download/ModelDownloadManager.d.ts +108 -0
  104. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -0
  105. package/lib/typescript/src/download/extractTarBz2.d.ts +14 -0
  106. package/lib/typescript/src/download/extractTarBz2.d.ts.map +1 -0
  107. package/lib/typescript/src/download/index.d.ts +7 -0
  108. package/lib/typescript/src/download/index.d.ts.map +1 -0
  109. package/lib/typescript/src/download/validation.d.ts +57 -0
  110. package/lib/typescript/src/download/validation.d.ts.map +1 -0
  111. package/lib/typescript/src/enhancement/index.d.ts +3 -2
  112. package/lib/typescript/src/enhancement/index.d.ts.map +1 -1
  113. package/lib/typescript/src/index.d.ts +26 -2
  114. package/lib/typescript/src/index.d.ts.map +1 -1
  115. package/lib/typescript/src/separation/index.d.ts +3 -2
  116. package/lib/typescript/src/separation/index.d.ts.map +1 -1
  117. package/lib/typescript/src/stt/index.d.ts +31 -43
  118. package/lib/typescript/src/stt/index.d.ts.map +1 -1
  119. package/lib/typescript/src/stt/sttModelLanguages.d.ts +52 -0
  120. package/lib/typescript/src/stt/sttModelLanguages.d.ts.map +1 -0
  121. package/lib/typescript/src/stt/types.d.ts +196 -9
  122. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  123. package/lib/typescript/src/tts/index.d.ts +25 -211
  124. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  125. package/lib/typescript/src/tts/types.d.ts +148 -25
  126. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  127. package/lib/typescript/src/types.d.ts +0 -32
  128. package/lib/typescript/src/types.d.ts.map +1 -1
  129. package/lib/typescript/src/utils.d.ts +28 -13
  130. package/lib/typescript/src/utils.d.ts.map +1 -1
  131. package/lib/typescript/src/vad/index.d.ts +3 -2
  132. package/lib/typescript/src/vad/index.d.ts.map +1 -1
  133. package/package.json +250 -222
  134. package/scripts/check-qnn-support.sh +78 -0
  135. package/scripts/setup-ios-framework.sh +379 -282
  136. package/src/NativeSherpaOnnx.ts +474 -251
  137. package/src/audio/index.ts +32 -0
  138. package/src/diarization/index.ts +4 -2
  139. package/src/download/ModelDownloadManager.ts +1325 -0
  140. package/src/download/extractTarBz2.ts +78 -0
  141. package/src/download/index.ts +43 -0
  142. package/src/download/validation.ts +279 -0
  143. package/src/enhancement/index.ts +4 -2
  144. package/src/index.tsx +78 -27
  145. package/src/separation/index.ts +4 -2
  146. package/src/stt/index.ts +249 -89
  147. package/src/stt/sttModelLanguages.ts +237 -0
  148. package/src/stt/types.ts +263 -9
  149. package/src/tts/index.ts +470 -458
  150. package/src/tts/types.ts +373 -218
  151. package/src/types.ts +0 -44
  152. package/src/utils.ts +145 -131
  153. package/src/vad/index.ts +4 -2
  154. package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -0
  155. package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -0
  156. package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -0
  157. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -0
  158. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -0
  159. package/android/src/main/cpp/include/sherpa-onnx/c-api/c-api.h +0 -1918
  160. package/android/src/main/cpp/include/sherpa-onnx/c-api/cxx-api.h +0 -841
  161. package/android/src/main/cpp/jni/sherpa-onnx-model-detect.cpp +0 -541
  162. package/android/src/main/cpp/jni/sherpa-onnx-stt-jni.cpp +0 -336
  163. package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.cpp +0 -222
  164. package/android/src/main/cpp/jni/sherpa-onnx-stt-wrapper.h +0 -68
  165. package/android/src/main/cpp/jni/sherpa-onnx-tts-jni.cpp +0 -823
  166. package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.cpp +0 -387
  167. package/android/src/main/cpp/jni/sherpa-onnx-tts-wrapper.h +0 -147
  168. package/ios/Frameworks/sherpa_onnx.xcframework.zip +0 -0
  169. package/ios/include/sherpa-onnx/c-api/c-api.h +0 -1918
  170. package/ios/include/sherpa-onnx/c-api/cxx-api.h +0 -841
  171. package/ios/sherpa-onnx-model-detect.mm +0 -441
  172. package/ios/sherpa-onnx-stt-wrapper.h +0 -48
  173. package/ios/sherpa-onnx-stt-wrapper.mm +0 -201
  174. package/scripts/copy-headers.js +0 -184
  175. package/scripts/setup-assets.js +0 -323
@@ -0,0 +1,888 @@
1
+ /**
2
+ * sherpa-onnx-audio-convert-jni.cpp
3
+ *
4
+ * Purpose: JNI for converting arbitrary audio files to WAV 16 kHz mono 16-bit PCM (sherpa-onnx
5
+ * input format). When HAVE_FFMPEG is set, FFmpeg is used; otherwise nativeConvertAudioToWav16k
6
+ * returns an error. Used by the Kotlin audio conversion API.
7
+ */
8
+ #include <android/log.h>
9
+ #include <jni.h>
10
+ #include <string>
11
+
12
+ #define LOG_TAG "AudioConvertJNI"
13
+ #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
14
+ #define LOGW(...) __android_log_print(ANDROID_LOG_WARN, LOG_TAG, __VA_ARGS__)
15
+ #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
16
+
17
+ #ifdef HAVE_FFMPEG
18
+ extern "C" {
19
+ #include <libavcodec/avcodec.h>
20
+ #include <libavformat/avformat.h>
21
+ #include <libavutil/opt.h>
22
+ #include <libswresample/swresample.h>
23
+ }
24
+ #include <cstdio>
25
+ #include <vector>
26
+ #endif
27
+
28
// Converts the first audio stream of `inputPath` into a WAV file at `outputPath`
// containing 16 kHz, mono, signed 16-bit little-endian PCM (the input format
// sherpa-onnx expects). The input may use any codec, sample rate or channel
// layout the linked FFmpeg build can decode; it is resampled/down-mixed as needed.
//
// Parameters:
//   inputPath  - path of the media file to read.
//   outputPath - path of the WAV file to create; the WAV muxer is always used,
//                regardless of the file extension.
//
// Returns an empty string on success, or a human-readable error message on
// failure. When the library is built without HAVE_FFMPEG the function does no
// work and always returns an explanatory error message.
static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
#ifdef HAVE_FFMPEG
    constexpr int kOutSampleRate = 16000;

    if (!inputPath || !outputPath) {
        return std::string("Invalid input or output path");
    }

    av_log_set_level(AV_LOG_ERROR);

    // Scope guard owning every FFmpeg object used below, so that each early
    // return releases exactly what has been acquired so far.
    struct Resources {
        AVFormatContext* inFmt = nullptr;
        AVFormatContext* outFmt = nullptr;
        AVCodecContext* decCtx = nullptr;
        AVCodecContext* encCtx = nullptr;
        SwrContext* swr = nullptr;
        AVPacket* pkt = nullptr;
        AVPacket* outPkt = nullptr;
        AVFrame* frame = nullptr;
        AVFrame* resampled = nullptr;
        // Must be zero-initialized: av_channel_layout_copy() un-initializes its
        // destination before copying into it.
        AVChannelLayout inLayout = {};

        ~Resources() {
            av_packet_free(&outPkt);
            av_packet_free(&pkt);
            av_frame_free(&resampled);
            av_frame_free(&frame);
            av_channel_layout_uninit(&inLayout);
            swr_free(&swr);
            avcodec_free_context(&encCtx);
            if (outFmt) {
                if (outFmt->pb && !(outFmt->oformat->flags & AVFMT_NOFILE)) {
                    avio_closep(&outFmt->pb);
                }
                avformat_free_context(outFmt);
            }
            avcodec_free_context(&decCtx);
            avformat_close_input(&inFmt);
        }
    } r;

    // ---- Input: open the container and pick the first audio stream ----
    if (avformat_open_input(&r.inFmt, inputPath, nullptr, nullptr) < 0) {
        return std::string("Failed to open input file");
    }
    if (avformat_find_stream_info(r.inFmt, nullptr) < 0) {
        return std::string("Failed to find stream info");
    }

    int audioStreamIndex = -1;
    for (unsigned i = 0; i < r.inFmt->nb_streams; ++i) {
        if (r.inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            audioStreamIndex = static_cast<int>(i);
            break;
        }
    }
    if (audioStreamIndex < 0) {
        return std::string("No audio stream found in input");
    }

    AVStream* inStream = r.inFmt->streams[audioStreamIndex];
    const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
    if (!decoder) {
        return std::string("Unsupported input codec");
    }

    r.decCtx = avcodec_alloc_context3(decoder);
    if (!r.decCtx) {
        return std::string("Failed to allocate decoder context");
    }
    if (avcodec_parameters_to_context(r.decCtx, inStream->codecpar) < 0) {
        return std::string("Failed to copy codec parameters");
    }
    if (avcodec_open2(r.decCtx, decoder, nullptr) < 0) {
        return std::string("Failed to open decoder");
    }

    // ---- Resampler: input layout/format/rate -> 16 kHz mono S16 ----
    // Prefer the stream's channel layout when it is known, otherwise fall back
    // to whatever the opened decoder reports.
    if (inStream->codecpar->ch_layout.nb_channels) {
        if (av_channel_layout_copy(&r.inLayout, &inStream->codecpar->ch_layout) < 0) {
            return std::string("Failed to copy input channel layout");
        }
    } else {
        if (av_channel_layout_copy(&r.inLayout, &r.decCtx->ch_layout) < 0) {
            return std::string("Failed to initialize input channel layout");
        }
    }
    // Some containers only carry a channel count; give the resampler a concrete
    // default layout for that count so the down-mix to mono is well defined.
    if (r.inLayout.order == AV_CHANNEL_ORDER_UNSPEC && r.inLayout.nb_channels > 0) {
        av_channel_layout_default(&r.inLayout, r.inLayout.nb_channels);
    }

    const AVChannelLayout monoLayout = AV_CHANNEL_LAYOUT_MONO;
    if (swr_alloc_set_opts2(&r.swr,
                            &monoLayout, AV_SAMPLE_FMT_S16, kOutSampleRate,
                            &r.inLayout, r.decCtx->sample_fmt, r.decCtx->sample_rate,
                            0, nullptr) < 0 || !r.swr) {
        return std::string("Failed to initialize resampler");
    }
    // swr_alloc_set_opts2() only stores the options; the context is unusable
    // (swr_convert() fails) until swr_init() has been called.
    if (swr_init(r.swr) < 0) {
        return std::string("Failed to initialize resampler");
    }
    // Rate the resampler was configured with; swr_init() rejects non-positive rates.
    const int inRate = r.decCtx->sample_rate;

    // ---- Output: WAV container with a single PCM S16LE stream ----
    if (avformat_alloc_output_context2(&r.outFmt, nullptr, "wav", outputPath) < 0 || !r.outFmt) {
        return std::string("Failed to allocate output context");
    }

    const AVCodec* pcmCodec = avcodec_find_encoder(AV_CODEC_ID_PCM_S16LE);
    if (!pcmCodec) {
        return std::string("PCM encoder not found");
    }

    AVStream* outStream = avformat_new_stream(r.outFmt, nullptr);
    if (!outStream) {
        return std::string("Failed to create output stream");
    }

    r.encCtx = avcodec_alloc_context3(pcmCodec);
    if (!r.encCtx) {
        return std::string("Failed to allocate encoder context");
    }
    if (av_channel_layout_copy(&r.encCtx->ch_layout, &monoLayout) < 0) {
        return std::string("Failed to set encoder channel layout");
    }
    r.encCtx->sample_rate = kOutSampleRate;
    r.encCtx->sample_fmt = AV_SAMPLE_FMT_S16;
    r.encCtx->bit_rate = 16 * kOutSampleRate;  // 16 bits per sample, one channel
    r.encCtx->time_base = AVRational{1, kOutSampleRate};

    if (r.outFmt->oformat->flags & AVFMT_GLOBALHEADER) {
        r.encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    if (avcodec_open2(r.encCtx, pcmCodec, nullptr) < 0) {
        return std::string("Failed to open PCM encoder");
    }
    if (avcodec_parameters_from_context(outStream->codecpar, r.encCtx) < 0) {
        return std::string("Failed to set output stream parameters");
    }
    outStream->time_base = r.encCtx->time_base;

    if (!(r.outFmt->oformat->flags & AVFMT_NOFILE)) {
        if (avio_open(&r.outFmt->pb, outputPath, AVIO_FLAG_WRITE) < 0) {
            return std::string("Failed to open output file for writing");
        }
    }
    if (avformat_write_header(r.outFmt, nullptr) < 0) {
        return std::string("Failed to write output header");
    }

    r.pkt = av_packet_alloc();
    r.outPkt = av_packet_alloc();
    r.frame = av_frame_alloc();
    r.resampled = av_frame_alloc();
    if (!r.pkt || !r.outPkt || !r.frame || !r.resampled) {
        return std::string("Failed to allocate packet or frame");
    }

    // Presentation timestamp (in output samples) of the next encoded frame.
    int64_t nextPts = 0;

    // Writes every packet the encoder currently has ready to the muxer.
    // Returns nullptr on success or an error message.
    auto writeEncodedPackets = [&]() -> const char* {
        for (;;) {
            int ret = avcodec_receive_packet(r.encCtx, r.outPkt);
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) return nullptr;
            if (ret < 0) return "Failed to encode audio";
            r.outPkt->stream_index = outStream->index;
            av_packet_rescale_ts(r.outPkt, r.encCtx->time_base, outStream->time_base);
            ret = av_interleaved_write_frame(r.outFmt, r.outPkt);
            av_packet_unref(r.outPkt);
            if (ret < 0) return "Failed to write audio data to output file";
        }
    };

    // Resamples `src` and encodes the result. Passing nullptr drains the samples
    // still buffered inside the resampler. Returns nullptr on success.
    auto resampleAndEncode = [&](const AVFrame* src) -> const char* {
        const int inSamples = src ? src->nb_samples : 0;
        // Upper bound of output samples: buffered delay plus this frame, rescaled to 16 kHz.
        const int64_t maxOut = av_rescale_rnd(swr_get_delay(r.swr, inRate) + inSamples,
                                              kOutSampleRate, inRate, AV_ROUND_UP);
        if (maxOut <= 0) return nullptr;

        av_frame_unref(r.resampled);
        r.resampled->format = AV_SAMPLE_FMT_S16;
        r.resampled->sample_rate = kOutSampleRate;
        r.resampled->nb_samples = static_cast<int>(maxOut);
        if (av_channel_layout_copy(&r.resampled->ch_layout, &monoLayout) < 0 ||
            av_frame_get_buffer(r.resampled, 0) < 0) {
            return "Failed to allocate resampled frame";
        }

        // Resample straight into the frame buffer; no intermediate copy needed.
        const int converted = swr_convert(
            r.swr, r.resampled->data, r.resampled->nb_samples,
            src ? const_cast<const uint8_t**>(src->extended_data) : nullptr, inSamples);
        if (converted < 0) return "Failed to resample audio";
        if (converted == 0) return nullptr;  // everything was buffered inside the resampler

        r.resampled->nb_samples = converted;
        r.resampled->pts = nextPts;
        nextPts += converted;

        if (avcodec_send_frame(r.encCtx, r.resampled) < 0) return "Failed to encode audio";
        return writeEncodedPackets();
    };

    // Pulls every frame the decoder has ready and pushes it down the pipeline.
    // Decode errors end the drain without failing the conversion (best effort
    // on damaged input).
    auto drainDecoder = [&]() -> const char* {
        while (avcodec_receive_frame(r.decCtx, r.frame) == 0) {
            const char* err = resampleAndEncode(r.frame);
            av_frame_unref(r.frame);
            if (err) return err;
        }
        return nullptr;
    };

    // ---- Main loop: demux -> decode -> resample -> encode -> mux ----
    while (av_read_frame(r.inFmt, r.pkt) >= 0) {
        const char* err = nullptr;
        if (r.pkt->stream_index == audioStreamIndex &&
            avcodec_send_packet(r.decCtx, r.pkt) == 0) {
            err = drainDecoder();
        }
        av_packet_unref(r.pkt);
        if (err) return std::string(err);
    }

    // ---- Flush every stage so the tail of the audio is not lost ----
    if (avcodec_send_packet(r.decCtx, nullptr) == 0) {
        if (const char* err = drainDecoder()) return std::string(err);
    }
    if (const char* err = resampleAndEncode(nullptr)) return std::string(err);

    avcodec_send_frame(r.encCtx, nullptr);
    if (const char* err = writeEncodedPackets()) return std::string(err);

    // The trailer patches the RIFF/data chunk sizes; closing flushes buffered bytes.
    if (av_write_trailer(r.outFmt) < 0) {
        return std::string("Failed to finalize output file");
    }
    if (!(r.outFmt->oformat->flags & AVFMT_NOFILE) && avio_closep(&r.outFmt->pb) < 0) {
        return std::string("Failed to finalize output file");
    }

    return std::string();
#else
    (void)inputPath;
    (void)outputPath;
    return "FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.";
#endif
}
305
+
306
+ // Generic conversion: supports writing WAV/MP3/FLAC depending on output file extension and linked encoders.
307
+ // WAV path always uses convertToWav16kMono (16 kHz mono out for sherpa-onnx). outputSampleRateHz is only used for MP3 (libshine: 32000/44100/48000); 0 = default 44100.
308
+ static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz) {
309
+ #ifdef HAVE_FFMPEG
310
+ // WAV output is always 16 kHz mono via convertToWav16kMono (sherpa-onnx). Input WAV at 16k is resampled 16k->16k (no change).
311
+ std::string fmt(formatHint ? formatHint : "");
312
+ if (fmt == "wav" || fmt == "wav16k") {
313
+ return convertToWav16kMono(inputPath, outputPath);
314
+ }
315
+
316
+ // Try to determine codec id from format hint
317
+ AVCodecID codec_id = AV_CODEC_ID_NONE;
318
+ if (fmt == "mp3") codec_id = AV_CODEC_ID_MP3;
319
+ else if (fmt == "flac") codec_id = AV_CODEC_ID_FLAC;
320
+ else {
321
+ // fallback to WAV
322
+ return convertToWav16kMono(inputPath, outputPath);
323
+ }
324
+
325
+ // The implementation for generic encoding uses the same decode+resample pipeline
326
+ // but selects encoder by codec_id and creates an output container based on file extension.
327
+ // For brevity we reuse much of the WAV path but change encoder selection.
328
+
329
+ // Open input
330
+ AVFormatContext* inFmt = nullptr;
331
+ if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
332
+ return std::string("Failed to open input file");
333
+ }
334
+ if (avformat_find_stream_info(inFmt, nullptr) < 0) {
335
+ avformat_close_input(&inFmt);
336
+ return std::string("Failed to find stream info");
337
+ }
338
+
339
+ int audioStreamIndex = -1;
340
+ for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
341
+ if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
342
+ audioStreamIndex = i;
343
+ break;
344
+ }
345
+ }
346
+ if (audioStreamIndex < 0) {
347
+ avformat_close_input(&inFmt);
348
+ return std::string("No audio stream found in input");
349
+ }
350
+
351
+ AVStream* inStream = inFmt->streams[audioStreamIndex];
352
+ const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
353
+ if (!decoder) {
354
+ avformat_close_input(&inFmt);
355
+ return std::string("Unsupported input codec");
356
+ }
357
+
358
+ AVCodecContext* decCtx = avcodec_alloc_context3(decoder);
359
+ if (!decCtx) {
360
+ avformat_close_input(&inFmt);
361
+ return std::string("Failed to allocate decoder context");
362
+ }
363
+ if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
364
+ avcodec_free_context(&decCtx);
365
+ avformat_close_input(&inFmt);
366
+ return std::string("Failed to copy codec parameters");
367
+ }
368
+ if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
369
+ avcodec_free_context(&decCtx);
370
+ avformat_close_input(&inFmt);
371
+ return std::string("Failed to open decoder");
372
+ }
373
+
374
+ // We'll configure resampler later based on encoder requirements.
375
+ SwrContext* swr = nullptr;
376
+
377
+ AVFormatContext* outFmt = nullptr;
378
+ if (avformat_alloc_output_context2(&outFmt, nullptr, nullptr, outputPath) < 0 || !outFmt) {
379
+ swr_free(&swr);
380
+ avcodec_free_context(&decCtx);
381
+ avformat_close_input(&inFmt);
382
+ return std::string("Failed to allocate output context");
383
+ }
384
+
385
+ const AVCodec* encoder = nullptr;
386
+ if (codec_id == AV_CODEC_ID_MP3) {
387
+ // Force using libshine for MP3 encoding. Do NOT fall back to libmp3lame or
388
+ // internal ffmpeg MP3 encoder to respect licensing choice.
389
+ encoder = avcodec_find_encoder_by_name("libshine");
390
+ if (!encoder) {
391
+ avformat_free_context(outFmt);
392
+ swr_free(&swr);
393
+ avcodec_free_context(&decCtx);
394
+ avformat_close_input(&inFmt);
395
+ return std::string("libshine encoder not available in this build");
396
+ }
397
+ } else {
398
+ encoder = avcodec_find_encoder(codec_id);
399
+ if (!encoder) {
400
+ avformat_free_context(outFmt);
401
+ swr_free(&swr);
402
+ avcodec_free_context(&decCtx);
403
+ avformat_close_input(&inFmt);
404
+ return std::string("Requested encoder not available in this build");
405
+ }
406
+ }
407
+
408
+ AVStream* outStream = avformat_new_stream(outFmt, nullptr);
409
+ if (!outStream) {
410
+ avformat_free_context(outFmt);
411
+ swr_free(&swr);
412
+ avcodec_free_context(&decCtx);
413
+ avformat_close_input(&inFmt);
414
+ return std::string("Failed to create output stream");
415
+ }
416
+
417
+ AVCodecContext* encCtx = avcodec_alloc_context3(encoder);
418
+ // Preserve input sample rate / channel layout by default
419
+ if (!encCtx) {
420
+ avformat_free_context(outFmt);
421
+ swr_free(&swr);
422
+ avcodec_free_context(&decCtx);
423
+ avformat_close_input(&inFmt);
424
+ return std::string("Failed to allocate encoder context");
425
+ }
426
+ // Set channel layout: prefer input stream layout, otherwise decoder layout.
427
+ if (inStream->codecpar->ch_layout.nb_channels) {
428
+ if (av_channel_layout_copy(&encCtx->ch_layout, &inStream->codecpar->ch_layout) < 0) {
429
+ avcodec_free_context(&encCtx);
430
+ avformat_free_context(outFmt);
431
+ swr_free(&swr);
432
+ avcodec_free_context(&decCtx);
433
+ avformat_close_input(&inFmt);
434
+ return std::string("Failed to copy input channel layout to encoder");
435
+ }
436
+ } else {
437
+ if (av_channel_layout_copy(&encCtx->ch_layout, &decCtx->ch_layout) < 0) {
438
+ avcodec_free_context(&encCtx);
439
+ avformat_free_context(outFmt);
440
+ swr_free(&swr);
441
+ avcodec_free_context(&decCtx);
442
+ avformat_close_input(&inFmt);
443
+ return std::string("Failed to set encoder channel layout");
444
+ }
445
+ }
446
+
447
+ // If using libshine (MP3), ensure channel_layout is explicitly set (old encoders expect it)
448
+ if (codec_id == AV_CODEC_ID_MP3) {
449
+ // If encCtx->ch_layout appears empty, set default based on input stream channels
450
+ if (encCtx->ch_layout.nb_channels <= 0) {
451
+ int nb_channels = 1;
452
+ if (inStream->codecpar && inStream->codecpar->ch_layout.nb_channels > 0) {
453
+ nb_channels = inStream->codecpar->ch_layout.nb_channels;
454
+ } else if (decCtx && decCtx->ch_layout.nb_channels > 0) {
455
+ nb_channels = decCtx->ch_layout.nb_channels;
456
+ }
457
+ av_channel_layout_default(&encCtx->ch_layout, nb_channels);
458
+ }
459
+ }
460
+
461
+ // Set sample rate from input/decoder if not already set
462
+ encCtx->sample_rate = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
463
+
464
+ // Probe encoder-supported configurations (sample formats, sample rates, channel layouts)
465
+ AVSampleFormat chosen_fmt = AV_SAMPLE_FMT_NONE;
466
+ const void *fmt_configs = nullptr;
467
+ int fmt_num = 0;
468
+ avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_SAMPLE_FORMAT, 0, &fmt_configs, &fmt_num);
469
+
470
+ const void *sr_configs = nullptr;
471
+ int sr_num = 0;
472
+ avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_SAMPLE_RATE, 0, &sr_configs, &sr_num);
473
+
474
+ const void *chl_configs = nullptr;
475
+ int chl_num = 0;
476
+ avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_CHANNEL_LAYOUT, 0, &chl_configs, &chl_num);
477
+
478
+ // Log supported sample formats
479
+ if (fmt_configs && fmt_num > 0) {
480
+ const AVSampleFormat *fmts = (const AVSampleFormat *)fmt_configs;
481
+ for (int i = 0; i < fmt_num; ++i) {
482
+ const char *name = av_get_sample_fmt_name(fmts[i]);
483
+ LOGI("encoder supported fmt[%d]=%s", i, name ? name : "?");
484
+ }
485
+ // prefer interleaved S16, then planar S16P, then decoder fmt, then first
486
+ for (int i = 0; i < fmt_num; ++i) if (fmts[i] == AV_SAMPLE_FMT_S16) { chosen_fmt = AV_SAMPLE_FMT_S16; break; }
487
+ if (chosen_fmt == AV_SAMPLE_FMT_NONE && codec_id == AV_CODEC_ID_MP3) {
488
+ for (int i = 0; i < fmt_num; ++i) if (fmts[i] == AV_SAMPLE_FMT_S16P) { chosen_fmt = AV_SAMPLE_FMT_S16P; break; }
489
+ }
490
+ if (chosen_fmt == AV_SAMPLE_FMT_NONE) {
491
+ for (int i = 0; i < fmt_num; ++i) if (fmts[i] == decCtx->sample_fmt) { chosen_fmt = decCtx->sample_fmt; break; }
492
+ }
493
+ if (chosen_fmt == AV_SAMPLE_FMT_NONE && fmt_num > 0) chosen_fmt = fmts[0];
494
+ } else {
495
+ // libshine only supports S16P; default to S16P for MP3 so open succeeds
496
+ chosen_fmt = (codec_id == AV_CODEC_ID_MP3) ? AV_SAMPLE_FMT_S16P : AV_SAMPLE_FMT_S16;
497
+ }
498
+ encCtx->sample_fmt = chosen_fmt;
499
+
500
+ // If supported sample rates are provided, pick one matching our target or fall back
501
+ if (sr_configs && sr_num > 0) {
502
+ const int *srs = (const int*)sr_configs;
503
+ int pick_sr = 0;
504
+ for (int i = 0; i < sr_num; ++i) {
505
+ LOGI("encoder supported sample_rate[%d]=%d", i, srs[i]);
506
+ if (srs[i] == encCtx->sample_rate) { pick_sr = srs[i]; break; }
507
+ }
508
+ if (pick_sr == 0) pick_sr = srs[0];
509
+ encCtx->sample_rate = pick_sr;
510
+ }
511
+ // libshine only supports 32000, 44100, 48000 Hz. Use outputSampleRateHz if valid (32000/44100/48000), else default 44100.
512
+ if (codec_id == AV_CODEC_ID_MP3) {
513
+ int want = (outputSampleRateHz == 32000 || outputSampleRateHz == 44100 || outputSampleRateHz == 48000) ? outputSampleRateHz : 44100;
514
+ if (encCtx->sample_rate != want) {
515
+ LOGI("libshine: setting sample_rate %d (requested %d)", want, outputSampleRateHz);
516
+ encCtx->sample_rate = want;
517
+ }
518
+ }
519
+
520
+ // If supported channel layouts given, prefer matching channels else pick first
521
+ if (chl_configs && chl_num > 0) {
522
+ const AVChannelLayout *layouts = (const AVChannelLayout *)chl_configs;
523
+ int pick_nb = 0;
524
+ for (int i = 0; i < chl_num; ++i) {
525
+ const AVChannelLayout *l = &layouts[i];
526
+ char buf[128];
527
+ av_channel_layout_describe(l, buf, sizeof(buf));
528
+ LOGI("encoder supported ch_layout[%d]=%s nb_channels=%d", i, buf, l->nb_channels);
529
+ if (l->nb_channels == encCtx->ch_layout.nb_channels) { pick_nb = l->nb_channels; break; }
530
+ }
531
+ if (pick_nb == 0) pick_nb = layouts[0].nb_channels > 0 ? layouts[0].nb_channels : 1;
532
+ if (encCtx->ch_layout.nb_channels != pick_nb) av_channel_layout_default(&encCtx->ch_layout, pick_nb);
533
+ }
534
+
535
+ // libshine reads only AVCodecContext (not options). Use a well-known channel layout so nb_channels is always valid.
536
+ if (codec_id == AV_CODEC_ID_MP3) {
537
+ int want_ch = (encCtx->ch_layout.nb_channels == 2) ? 2 : 1;
538
+ av_channel_layout_uninit(&encCtx->ch_layout);
539
+ if (want_ch == 2) {
540
+ AVChannelLayout stereo = AV_CHANNEL_LAYOUT_STEREO;
541
+ if (av_channel_layout_copy(&encCtx->ch_layout, &stereo) < 0)
542
+ av_channel_layout_default(&encCtx->ch_layout, 2);
543
+ } else {
544
+ AVChannelLayout mono = AV_CHANNEL_LAYOUT_MONO;
545
+ if (av_channel_layout_copy(&encCtx->ch_layout, &mono) < 0)
546
+ av_channel_layout_default(&encCtx->ch_layout, 1);
547
+ }
548
+ }
549
+
550
+ // Set a sensible default bitrate for compressed codecs
551
+ if (codec_id == AV_CODEC_ID_MP3 || codec_id == AV_CODEC_ID_AAC) encCtx->bit_rate = 128000;
552
+ else encCtx->bit_rate = 0; // lossless or PCM may ignore
553
+
554
+ if (outFmt->oformat->flags & AVFMT_GLOBALHEADER) encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
555
+
556
+ // Ensure sensible timebase and try opening encoder with options. If it fails, iterate supported sample formats and retry.
557
+ if (encCtx->sample_rate > 0) encCtx->time_base = AVRational{1, encCtx->sample_rate};
558
+
559
+ AVDictionary *enc_opts = nullptr;
560
+ int nb_ch = encCtx->ch_layout.nb_channels;
561
+ if (nb_ch <= 0) nb_ch = 1;
562
+ char tmpbuf[64];
563
+ // For libshine, do not pass options — it uses only AVCodecContext; options can cause "Invalid argument".
564
+ if (codec_id != AV_CODEC_ID_MP3) {
565
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", nb_ch);
566
+ av_dict_set(&enc_opts, "channels", tmpbuf, 0);
567
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->sample_rate);
568
+ av_dict_set(&enc_opts, "sample_rate", tmpbuf, 0);
569
+ if (encCtx->bit_rate > 0) {
570
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", (int)encCtx->bit_rate);
571
+ av_dict_set(&enc_opts, "bit_rate", tmpbuf, 0);
572
+ }
573
+ }
574
+
575
+ int ret = avcodec_open2(encCtx, encoder, &enc_opts);
576
+ if (ret < 0) {
577
+ char errbuf[256];
578
+ av_strerror(ret, errbuf, sizeof(errbuf));
579
+ if (enc_opts) { av_dict_free(&enc_opts); enc_opts = nullptr; }
580
+
581
+ // libshine (MP3): we already set S16P, valid rate, mono/stereo; no useful fallback.
582
+ if (codec_id == AV_CODEC_ID_MP3) {
583
+ std::string msg = std::string("Failed to open encoder: ") + errbuf;
584
+ avcodec_free_context(&encCtx);
585
+ avformat_free_context(outFmt);
586
+ swr_free(&swr);
587
+ avcodec_free_context(&decCtx);
588
+ avformat_close_input(&inFmt);
589
+ return msg;
590
+ }
591
+
592
+ LOGW("avcodec_open2 failed for encoder %s: %s. Trying alternatives.", encoder->name, errbuf);
593
+
594
+ // Try each supported sample format (for non-MP3 encoders that may accept multiple formats)
595
+ const AVSampleFormat *fmts = fmt_configs ? (const AVSampleFormat*)fmt_configs : nullptr;
596
+ if (fmts && fmt_num > 0) {
597
+ for (int i = 0; i < fmt_num && ret < 0; ++i) {
598
+ encCtx->sample_fmt = fmts[i];
599
+ AVDictionary *try_opts = nullptr;
600
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->ch_layout.nb_channels > 0 ? encCtx->ch_layout.nb_channels : 1);
601
+ av_dict_set(&try_opts, "channels", tmpbuf, 0);
602
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->sample_rate);
603
+ av_dict_set(&try_opts, "sample_rate", tmpbuf, 0);
604
+ if (encCtx->bit_rate > 0) { snprintf(tmpbuf, sizeof(tmpbuf), "%d", (int)encCtx->bit_rate); av_dict_set(&try_opts, "bit_rate", tmpbuf, 0); }
605
+ const char *sfname = av_get_sample_fmt_name(encCtx->sample_fmt);
606
+ if (sfname) av_dict_set(&try_opts, "sample_fmt", sfname, 0);
607
+ int r = avcodec_open2(encCtx, encoder, &try_opts);
608
+ if (r >= 0) {
609
+ if (try_opts) av_dict_free(&try_opts);
610
+ ret = r;
611
+ break;
612
+ }
613
+ if (try_opts) av_dict_free(&try_opts);
614
+ }
615
+ }
616
+
617
+ // Last resort: try S16 then S16P (for FLAC etc.)
618
+ if (ret < 0) {
619
+ AVSampleFormat fallbacks[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P };
620
+ for (int fi = 0; fi < 2 && ret < 0; ++fi) {
621
+ encCtx->sample_fmt = fallbacks[fi];
622
+ AVDictionary *try_opts = nullptr;
623
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->ch_layout.nb_channels > 0 ? encCtx->ch_layout.nb_channels : 1);
624
+ av_dict_set(&try_opts, "channels", tmpbuf, 0);
625
+ snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->sample_rate);
626
+ av_dict_set(&try_opts, "sample_rate", tmpbuf, 0);
627
+ if (encCtx->bit_rate > 0) { snprintf(tmpbuf, sizeof(tmpbuf), "%d", (int)encCtx->bit_rate); av_dict_set(&try_opts, "bit_rate", tmpbuf, 0); }
628
+ const char *sfname = av_get_sample_fmt_name(encCtx->sample_fmt);
629
+ if (sfname) av_dict_set(&try_opts, "sample_fmt", sfname, 0);
630
+ int r = avcodec_open2(encCtx, encoder, &try_opts);
631
+ if (r >= 0) {
632
+ if (try_opts) av_dict_free(&try_opts);
633
+ ret = r;
634
+ break;
635
+ }
636
+ if (try_opts) av_dict_free(&try_opts);
637
+ }
638
+ }
639
+
640
+ if (ret < 0) {
641
+ char eb[256]; av_strerror(ret, eb, sizeof(eb));
642
+ std::string msg = std::string("Failed to open encoder: ") + eb;
643
+ avcodec_free_context(&encCtx);
644
+ avformat_free_context(outFmt);
645
+ swr_free(&swr);
646
+ avcodec_free_context(&decCtx);
647
+ avformat_close_input(&inFmt);
648
+ return msg;
649
+ }
650
+ }
651
+
652
+ if (avcodec_parameters_from_context(outStream->codecpar, encCtx) < 0) {
653
+ avcodec_free_context(&encCtx);
654
+ avformat_free_context(outFmt);
655
+ swr_free(&swr);
656
+ avcodec_free_context(&decCtx);
657
+ avformat_close_input(&inFmt);
658
+ return std::string("Failed to set output stream parameters");
659
+ }
660
+
661
+ if (!(outFmt->oformat->flags & AVFMT_NOFILE)) {
662
+ if (avio_open(&outFmt->pb, outputPath, AVIO_FLAG_WRITE) < 0) {
663
+ avcodec_free_context(&encCtx);
664
+ avformat_free_context(outFmt);
665
+ swr_free(&swr);
666
+ avcodec_free_context(&decCtx);
667
+ avformat_close_input(&inFmt);
668
+ return std::string("Failed to open output file for writing");
669
+ }
670
+ }
671
+
672
+ if (avformat_write_header(outFmt, nullptr) < 0) {
673
+ if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
674
+ avcodec_free_context(&encCtx);
675
+ avformat_free_context(outFmt);
676
+ swr_free(&swr);
677
+ avcodec_free_context(&decCtx);
678
+ avformat_close_input(&inFmt);
679
+ return std::string("Failed to write output header");
680
+ }
681
+
682
+ AVPacket* pkt = av_packet_alloc();
683
+ AVFrame* frame = av_frame_alloc();
684
+ AVFrame* resampled = av_frame_alloc();
685
+ // Match encoder format/rate
686
+ resampled->format = encCtx->sample_fmt;
687
+ resampled->sample_rate = encCtx->sample_rate;
688
+ // ensure resampled frame has encoder channel layout
689
+ if (av_channel_layout_copy(&resampled->ch_layout, &encCtx->ch_layout) < 0) {
690
+ av_frame_free(&frame);
691
+ av_frame_free(&resampled);
692
+ av_packet_free(&pkt);
693
+ avcodec_free_context(&encCtx);
694
+ avformat_free_context(outFmt);
695
+ avcodec_free_context(&decCtx);
696
+ avformat_close_input(&inFmt);
697
+ return std::string("Failed to set resampled channel layout");
698
+ }
699
+
700
+ // Initialize resampler to convert from decoder format -> chosen encoder format
701
+ AVChannelLayout in_ch_layout2{};
702
+ if (inStream->codecpar->ch_layout.nb_channels) {
703
+ if (av_channel_layout_copy(&in_ch_layout2, &inStream->codecpar->ch_layout) < 0) {
704
+ av_channel_layout_uninit(&resampled->ch_layout);
705
+ av_frame_free(&frame);
706
+ av_frame_free(&resampled);
707
+ av_packet_free(&pkt);
708
+ avcodec_free_context(&encCtx);
709
+ avformat_free_context(outFmt);
710
+ swr_free(&swr);
711
+ avcodec_free_context(&decCtx);
712
+ avformat_close_input(&inFmt);
713
+ return std::string("Failed to copy input channel layout");
714
+ }
715
+ } else {
716
+ if (av_channel_layout_copy(&in_ch_layout2, &decCtx->ch_layout) < 0) {
717
+ av_channel_layout_uninit(&resampled->ch_layout);
718
+ av_frame_free(&frame);
719
+ av_frame_free(&resampled);
720
+ av_packet_free(&pkt);
721
+ avcodec_free_context(&encCtx);
722
+ avformat_free_context(outFmt);
723
+ swr_free(&swr);
724
+ avcodec_free_context(&decCtx);
725
+ avformat_close_input(&inFmt);
726
+ return std::string("Failed to init input channel layout");
727
+ }
728
+ }
729
+ if (swr_alloc_set_opts2(&swr,
730
+ &encCtx->ch_layout, encCtx->sample_fmt, encCtx->sample_rate,
731
+ &in_ch_layout2, (AVSampleFormat)decCtx->sample_fmt, decCtx->sample_rate,
732
+ 0, nullptr) < 0 || !swr) {
733
+ av_channel_layout_uninit(&in_ch_layout2);
734
+ if (swr) swr_free(&swr);
735
+ av_channel_layout_uninit(&resampled->ch_layout);
736
+ av_frame_free(&frame);
737
+ av_frame_free(&resampled);
738
+ av_packet_free(&pkt);
739
+ avcodec_free_context(&encCtx);
740
+ avformat_free_context(outFmt);
741
+ avcodec_free_context(&decCtx);
742
+ avformat_close_input(&inFmt);
743
+ return std::string("Failed to initialize resampler");
744
+ }
745
+ av_channel_layout_uninit(&in_ch_layout2);
746
+
747
+ while (av_read_frame(inFmt, pkt) >= 0) {
748
+ if (pkt->stream_index == audioStreamIndex) {
749
+ if (avcodec_send_packet(decCtx, pkt) == 0) {
750
+ while (avcodec_receive_frame(decCtx, frame) == 0) {
751
+ int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
752
+ int64_t out_nb_samples = av_rescale_rnd(swr_get_delay(swr, in_sr2) + frame->nb_samples, encCtx->sample_rate, in_sr2, AV_ROUND_UP);
753
+ uint8_t** outData = nullptr;
754
+ int out_ch2 = encCtx->ch_layout.nb_channels;
755
+ if (out_ch2 <= 0) out_ch2 = 1;
756
+ if (av_samples_alloc_array_and_samples(&outData, nullptr, out_ch2, (int)out_nb_samples, encCtx->sample_fmt, 0) < 0) {
757
+ av_packet_unref(pkt);
758
+ continue;
759
+ }
760
+ int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
761
+ if (converted < 0) {
762
+ av_freep(&outData[0]);
763
+ av_freep(&outData);
764
+ continue;
765
+ }
766
+
767
+ resampled->nb_samples = converted;
768
+ if (av_frame_get_buffer(resampled, 0) < 0) {
769
+ av_freep(&outData[0]);
770
+ av_freep(&outData);
771
+ continue;
772
+ }
773
+ int bytes_per_sample = av_get_bytes_per_sample((AVSampleFormat)resampled->format);
774
+ int copy_size2 = converted * bytes_per_sample * out_ch2;
775
+ memcpy(resampled->data[0], outData[0], copy_size2);
776
+
777
+ if (avcodec_send_frame(encCtx, resampled) == 0) {
778
+ AVPacket* outPkt = av_packet_alloc();
779
+ while (avcodec_receive_packet(encCtx, outPkt) == 0) {
780
+ outPkt->stream_index = outStream->index;
781
+ av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
782
+ av_interleaved_write_frame(outFmt, outPkt);
783
+ av_packet_unref(outPkt);
784
+ }
785
+ av_packet_free(&outPkt);
786
+ }
787
+
788
+ av_freep(&outData[0]);
789
+ av_freep(&outData);
790
+ av_frame_unref(resampled);
791
+ av_frame_unref(frame);
792
+ }
793
+ }
794
+ }
795
+ av_packet_unref(pkt);
796
+ }
797
+
798
+ // Flush encoder
799
+ avcodec_send_frame(encCtx, nullptr);
800
+ AVPacket* outPkt2 = av_packet_alloc();
801
+ while (avcodec_receive_packet(encCtx, outPkt2) == 0) {
802
+ outPkt2->stream_index = outStream->index;
803
+ av_packet_rescale_ts(outPkt2, encCtx->time_base, outStream->time_base);
804
+ av_interleaved_write_frame(outFmt, outPkt2);
805
+ av_packet_unref(outPkt2);
806
+ }
807
+ av_packet_free(&outPkt2);
808
+
809
+ av_write_trailer(outFmt);
810
+ if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
811
+
812
+ av_packet_free(&pkt);
813
+ av_frame_free(&frame);
814
+ av_channel_layout_uninit(&resampled->ch_layout);
815
+ av_frame_free(&resampled);
816
+
817
+ swr_free(&swr);
818
+ avcodec_free_context(&encCtx);
819
+ avformat_free_context(outFmt);
820
+ avcodec_free_context(&decCtx);
821
+ avformat_close_input(&inFmt);
822
+
823
+ return std::string("");
824
+ #else
825
+ (void)inputPath; (void)outputPath; (void)formatHint;
826
+ return std::string("FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.");
827
+ #endif
828
+ }
829
+
830
+ extern "C" {
831
+
832
+ // Called from Kotlin: SherpaOnnxModule.nativeConvertAudioToWav16k(inputPath, outputPath) -> Boolean
833
+ // or from a dedicated helper that returns an error string. We use a single JNI that returns a boolean
834
+ // and optionally pass back an error message via a separate call or out parameter.
835
+ // For simplicity we expose one method that returns a jstring: empty = success, non-empty = error message.
836
+ JNIEXPORT jstring JNICALL
837
+ Java_com_sherpaonnx_SherpaOnnxModule_nativeConvertAudioToWav16k(
838
+ JNIEnv* env,
839
+ jobject /* this */,
840
+ jstring inputPath,
841
+ jstring outputPath) {
842
+ if (inputPath == nullptr || outputPath == nullptr) {
843
+ return env->NewStringUTF("inputPath and outputPath must be non-null");
844
+ }
845
+ const char* input = env->GetStringUTFChars(inputPath, nullptr);
846
+ const char* output = env->GetStringUTFChars(outputPath, nullptr);
847
+ if (input == nullptr || output == nullptr) {
848
+ if (input) env->ReleaseStringUTFChars(inputPath, input);
849
+ if (output) env->ReleaseStringUTFChars(outputPath, output);
850
+ return env->NewStringUTF("Failed to get path strings");
851
+ }
852
+ std::string err = convertToWav16kMono(input, output);
853
+ env->ReleaseStringUTFChars(inputPath, input);
854
+ env->ReleaseStringUTFChars(outputPath, output);
855
+ return env->NewStringUTF(err.c_str());
856
+ }
857
+
858
+ JNIEXPORT jstring JNICALL
859
+ Java_com_sherpaonnx_SherpaOnnxModule_nativeConvertAudioToFormat(
860
+ JNIEnv* env,
861
+ jobject /* this */,
862
+ jstring inputPath,
863
+ jstring outputPath,
864
+ jstring formatHint,
865
+ jint outputSampleRateHz) {
866
+ if (inputPath == nullptr || outputPath == nullptr || formatHint == nullptr) {
867
+ return env->NewStringUTF("inputPath, outputPath and formatHint must be non-null");
868
+ }
869
+ const char* input = env->GetStringUTFChars(inputPath, nullptr);
870
+ const char* output = env->GetStringUTFChars(outputPath, nullptr);
871
+ const char* fmt = env->GetStringUTFChars(formatHint, nullptr);
872
+ if (input == nullptr || output == nullptr || fmt == nullptr) {
873
+ if (input) env->ReleaseStringUTFChars(inputPath, input);
874
+ if (output) env->ReleaseStringUTFChars(outputPath, output);
875
+ if (fmt) env->ReleaseStringUTFChars(formatHint, fmt);
876
+ return env->NewStringUTF("Failed to get path/format strings");
877
+ }
878
+
879
+ std::string err = convertToFormat(input, output, fmt, (int)outputSampleRateHz);
880
+
881
+ env->ReleaseStringUTFChars(inputPath, input);
882
+ env->ReleaseStringUTFChars(outputPath, output);
883
+ env->ReleaseStringUTFChars(formatHint, fmt);
884
+
885
+ return env->NewStringUTF(err.c_str());
886
+ }
887
+
888
+ } // extern "C"