react-native-sherpa-onnx 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +84 -77
  2. package/SherpaOnnx.podspec +79 -45
  3. package/android/build.gradle +8 -2
  4. package/android/prebuilt-download.gradle +70 -16
  5. package/android/prebuilt-versions.gradle +14 -6
  6. package/android/src/main/cpp/CMakeLists.txt +2 -0
  7. package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +202 -328
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +22 -0
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +2 -0
  10. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +96 -142
  11. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +40 -4
  12. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +774 -316
  13. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +208 -122
  14. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +92 -0
  15. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
  16. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +14 -2
  17. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +229 -0
  18. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.h +38 -0
  19. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +144 -0
  20. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.h +38 -0
  21. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +1 -1
  22. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +157 -11
  23. package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
  24. package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +75 -24
  25. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +52 -1
  26. package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
  27. package/ios/SherpaOnnx+STT.mm +2 -0
  28. package/ios/SherpaOnnx+TTS.mm +17 -0
  29. package/ios/SherpaOnnx.mm +27 -3
  30. package/ios/SherpaOnnxAudioConvert.h +28 -0
  31. package/ios/SherpaOnnxAudioConvert.mm +698 -0
  32. package/ios/archive/sherpa-onnx-archive-helper.mm +12 -0
  33. package/ios/model_detect/sherpa-onnx-model-detect-helper.h +37 -3
  34. package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +80 -45
  35. package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +629 -267
  36. package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +148 -56
  37. package/ios/model_detect/sherpa-onnx-model-detect.h +72 -0
  38. package/ios/model_detect/sherpa-onnx-validate-stt.h +38 -0
  39. package/ios/model_detect/sherpa-onnx-validate-stt.mm +229 -0
  40. package/ios/model_detect/sherpa-onnx-validate-tts.h +38 -0
  41. package/ios/model_detect/sherpa-onnx-validate-tts.mm +144 -0
  42. package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
  43. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  44. package/lib/module/audio/index.js +55 -1
  45. package/lib/module/audio/index.js.map +1 -1
  46. package/lib/module/download/ModelDownloadManager.js +14 -0
  47. package/lib/module/download/ModelDownloadManager.js.map +1 -1
  48. package/lib/module/index.js +10 -0
  49. package/lib/module/index.js.map +1 -1
  50. package/lib/module/stt/streaming.js +6 -3
  51. package/lib/module/stt/streaming.js.map +1 -1
  52. package/lib/module/tts/index.js +13 -1
  53. package/lib/module/tts/index.js.map +1 -1
  54. package/lib/typescript/src/NativeSherpaOnnx.d.ts +32 -3
  55. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  56. package/lib/typescript/src/audio/index.d.ts +20 -1
  57. package/lib/typescript/src/audio/index.d.ts.map +1 -1
  58. package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
  59. package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
  60. package/lib/typescript/src/index.d.ts +10 -0
  61. package/lib/typescript/src/index.d.ts.map +1 -1
  62. package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
  63. package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
  64. package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
  65. package/lib/typescript/src/tts/index.d.ts +12 -1
  66. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  67. package/package.json +6 -1
  68. package/scripts/check-model-csvs.sh +72 -0
  69. package/scripts/setup-ios-framework.sh +272 -191
  70. package/src/NativeSherpaOnnx.ts +37 -3
  71. package/src/audio/index.ts +84 -1
  72. package/src/download/ModelDownloadManager.ts +19 -0
  73. package/src/index.tsx +15 -0
  74. package/src/stt/streaming.ts +10 -5
  75. package/src/stt/streamingTypes.ts +1 -1
  76. package/src/tts/index.ts +25 -1
  77. package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -1
  78. package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
  79. package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
  80. package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
  81. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
  82. package/ios/scripts/patch-libarchive-includes.sh +0 -61
  83. package/ios/scripts/setup-ios-libarchive.sh +0 -98
@@ -8,6 +8,7 @@
8
8
  #include <android/log.h>
9
9
  #include <jni.h>
10
10
  #include <string>
11
+ #include <sys/stat.h>
11
12
 
12
13
  #define LOG_TAG "AudioConvertJNI"
13
14
  #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
@@ -19,316 +20,48 @@ extern "C" {
19
20
  #include <libavcodec/avcodec.h>
20
21
  #include <libavformat/avformat.h>
21
22
  #include <libavutil/opt.h>
23
+ #include <libavutil/error.h>
22
24
  #include <libswresample/swresample.h>
23
25
  }
24
26
  #include <cstdio>
25
27
  #include <vector>
26
28
  #endif
27
29
 
28
- // Returns empty string on success, or error message on failure.
29
- // Output is always 16 kHz mono 16-bit PCM (sherpa-onnx requirement). Input can be any rate; we resample to 16k.
30
- static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
31
- #ifdef HAVE_FFMPEG
32
- // Implement a basic decode -> resample -> write WAV pipeline using libav* APIs.
33
- av_log_set_level(AV_LOG_ERROR);
34
-
35
- AVFormatContext* inFmt = nullptr;
36
- if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
37
- return std::string("Failed to open input file");
38
- }
39
- if (avformat_find_stream_info(inFmt, nullptr) < 0) {
40
- avformat_close_input(&inFmt);
41
- return std::string("Failed to find stream info");
42
- }
43
-
44
- int audioStreamIndex = -1;
45
- for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
46
- if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
47
- audioStreamIndex = i;
48
- break;
49
- }
50
- }
51
- if (audioStreamIndex < 0) {
52
- avformat_close_input(&inFmt);
53
- return std::string("No audio stream found in input");
54
- }
55
-
56
- AVStream* inStream = inFmt->streams[audioStreamIndex];
57
- const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
58
- if (!decoder) {
59
- avformat_close_input(&inFmt);
60
- return std::string("Unsupported input codec");
61
- }
62
-
63
- AVCodecContext* decCtx = avcodec_alloc_context3(decoder);
64
- if (!decCtx) {
65
- avformat_close_input(&inFmt);
66
- return std::string("Failed to allocate decoder context");
67
- }
68
- if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
69
- avcodec_free_context(&decCtx);
70
- avformat_close_input(&inFmt);
71
- return std::string("Failed to copy codec parameters");
72
- }
73
- if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
74
- avcodec_free_context(&decCtx);
75
- avformat_close_input(&inFmt);
76
- return std::string("Failed to open decoder");
77
- }
78
-
79
- // Prepare resampler to 16k mono s16 using AVChannelLayout helpers
80
- SwrContext* swr = nullptr;
81
- AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO;
82
- AVChannelLayout in_ch_layout;
83
- // Prefer codecpar ch_layout when available, otherwise fall back to decoder ctx
84
- if (inStream->codecpar->ch_layout.nb_channels) {
85
- if (av_channel_layout_copy(&in_ch_layout, &inStream->codecpar->ch_layout) < 0) {
86
- avcodec_free_context(&decCtx);
87
- avformat_close_input(&inFmt);
88
- return std::string("Failed to copy input channel layout");
89
- }
90
- } else {
91
- if (av_channel_layout_copy(&in_ch_layout, &decCtx->ch_layout) < 0) {
92
- avcodec_free_context(&decCtx);
93
- avformat_close_input(&inFmt);
94
- return std::string("Failed to initialize input channel layout");
95
- }
96
- }
97
- if (swr_alloc_set_opts2(&swr,
98
- &out_ch_layout, AV_SAMPLE_FMT_S16, 16000,
99
- &in_ch_layout, (AVSampleFormat)decCtx->sample_fmt, decCtx->sample_rate,
100
- 0, nullptr) < 0 || !swr) {
101
- av_channel_layout_uninit(&in_ch_layout);
102
- if (swr) swr_free(&swr);
103
- avcodec_free_context(&decCtx);
104
- avformat_close_input(&inFmt);
105
- return std::string("Failed to initialize resampler");
106
- }
107
- av_channel_layout_uninit(&in_ch_layout);
108
-
109
- // Prepare output WAV via avformat
110
- AVFormatContext* outFmt = nullptr;
111
- if (avformat_alloc_output_context2(&outFmt, nullptr, nullptr, outputPath) < 0 || !outFmt) {
112
- swr_free(&swr);
113
- avcodec_free_context(&decCtx);
114
- avformat_close_input(&inFmt);
115
- return std::string("Failed to allocate output context");
116
- }
117
-
118
- const AVCodec* pcmCodec = avcodec_find_encoder(AV_CODEC_ID_PCM_S16LE);
119
- if (!pcmCodec) {
120
- avformat_free_context(outFmt);
121
- swr_free(&swr);
122
- avcodec_free_context(&decCtx);
123
- avformat_close_input(&inFmt);
124
- return std::string("PCM encoder not found");
125
- }
126
-
127
- AVStream* outStream = avformat_new_stream(outFmt, nullptr);
128
- if (!outStream) {
129
- avformat_free_context(outFmt);
130
- swr_free(&swr);
131
- avcodec_free_context(&decCtx);
132
- avformat_close_input(&inFmt);
133
- return std::string("Failed to create output stream");
134
- }
135
-
136
- AVCodecContext* encCtx = avcodec_alloc_context3(pcmCodec);
137
- // Configure encoder context for mono 16k s16 output
138
- AVChannelLayout mono_layout = AV_CHANNEL_LAYOUT_MONO;
139
- if (!encCtx) {
140
- avformat_free_context(outFmt);
141
- swr_free(&swr);
142
- avcodec_free_context(&decCtx);
143
- avformat_close_input(&inFmt);
144
- return std::string("Failed to allocate encoder context");
145
- }
146
- if (av_channel_layout_copy(&encCtx->ch_layout, &mono_layout) < 0) {
147
- avcodec_free_context(&encCtx);
148
- avformat_free_context(outFmt);
149
- swr_free(&swr);
150
- avcodec_free_context(&decCtx);
151
- avformat_close_input(&inFmt);
152
- return std::string("Failed to set encoder channel layout");
153
- }
154
- encCtx->sample_rate = 16000;
155
- encCtx->sample_fmt = AV_SAMPLE_FMT_S16;
156
- encCtx->bit_rate = 16 * 16000; // rough
157
-
158
- if (outFmt->oformat->flags & AVFMT_GLOBALHEADER) encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
159
-
160
- if (avcodec_open2(encCtx, pcmCodec, nullptr) < 0) {
161
- avcodec_free_context(&encCtx);
162
- avformat_free_context(outFmt);
163
- swr_free(&swr);
164
- avcodec_free_context(&decCtx);
165
- avformat_close_input(&inFmt);
166
- return std::string("Failed to open PCM encoder");
167
- }
168
-
169
- if (avcodec_parameters_from_context(outStream->codecpar, encCtx) < 0) {
170
- avcodec_free_context(&encCtx);
171
- avformat_free_context(outFmt);
172
- swr_free(&swr);
173
- avcodec_free_context(&decCtx);
174
- avformat_close_input(&inFmt);
175
- return std::string("Failed to set output stream parameters");
176
- }
177
-
178
- if (!(outFmt->oformat->flags & AVFMT_NOFILE)) {
179
- if (avio_open(&outFmt->pb, outputPath, AVIO_FLAG_WRITE) < 0) {
180
- avcodec_free_context(&encCtx);
181
- avformat_free_context(outFmt);
182
- swr_free(&swr);
183
- avcodec_free_context(&decCtx);
184
- avformat_close_input(&inFmt);
185
- return std::string("Failed to open output file for writing");
186
- }
187
- }
188
-
189
- if (avformat_write_header(outFmt, nullptr) < 0) {
190
- if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
191
- avcodec_free_context(&encCtx);
192
- avformat_free_context(outFmt);
193
- swr_free(&swr);
194
- avcodec_free_context(&decCtx);
195
- avformat_close_input(&inFmt);
196
- return std::string("Failed to write output header");
197
- }
198
-
199
- AVPacket* pkt = av_packet_alloc();
200
- AVFrame* frame = av_frame_alloc();
201
- AVFrame* resampled = av_frame_alloc();
202
- // Configure resampled frame metadata
203
- resampled->format = AV_SAMPLE_FMT_S16;
204
- resampled->sample_rate = 16000;
205
- // set channel layout on frame
206
- AVChannelLayout out_ch_layout_local = AV_CHANNEL_LAYOUT_MONO;
207
- if (av_channel_layout_copy(&resampled->ch_layout, &out_ch_layout_local) < 0) {
208
- av_frame_free(&frame);
209
- av_frame_free(&resampled);
210
- swr_free(&swr);
211
- avcodec_free_context(&decCtx);
212
- avformat_close_input(&inFmt);
213
- return std::string("Failed to set resampled frame channel layout");
214
- }
215
-
216
- // Buffer for resampled data will be allocated per needed samples
217
-
218
- while (av_read_frame(inFmt, pkt) >= 0) {
219
- if (pkt->stream_index == audioStreamIndex) {
220
- if (avcodec_send_packet(decCtx, pkt) == 0) {
221
- while (avcodec_receive_frame(decCtx, frame) == 0) {
222
- // Resample
223
- int in_sr = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
224
- int64_t out_nb_samples = av_rescale_rnd(swr_get_delay(swr, in_sr) + frame->nb_samples, 16000, in_sr, AV_ROUND_UP);
225
- uint8_t** outData = nullptr;
226
- int out_channels = resampled->ch_layout.nb_channels;
227
- if (out_channels <= 0) out_channels = 1;
228
- if (av_samples_alloc_array_and_samples(&outData, nullptr, out_channels, (int)out_nb_samples, AV_SAMPLE_FMT_S16, 0) < 0) {
229
- av_packet_unref(pkt);
230
- continue;
231
- }
232
- int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
233
- if (converted < 0) {
234
- av_freep(&outData[0]);
235
- av_freep(&outData);
236
- continue;
237
- }
238
-
239
- // prepare frame for encoder
240
- resampled->nb_samples = converted;
241
- if (av_frame_get_buffer(resampled, 0) < 0) {
242
- av_freep(&outData[0]);
243
- av_freep(&outData);
244
- continue;
245
- }
246
- // copy data into resampled frame
247
- int bytes_per_sample = av_get_bytes_per_sample((AVSampleFormat)resampled->format);
248
- int copy_size = converted * bytes_per_sample * out_channels;
249
- memcpy(resampled->data[0], outData[0], copy_size);
250
-
251
- // send to encoder
252
- if (avcodec_send_frame(encCtx, resampled) == 0) {
253
- AVPacket* outPkt = av_packet_alloc();
254
- while (avcodec_receive_packet(encCtx, outPkt) == 0) {
255
- outPkt->stream_index = outStream->index;
256
- av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
257
- av_interleaved_write_frame(outFmt, outPkt);
258
- av_packet_unref(outPkt);
259
- }
260
- av_packet_free(&outPkt);
261
- }
262
-
263
- av_freep(&outData[0]);
264
- av_freep(&outData);
265
- av_frame_unref(resampled);
266
- av_frame_unref(frame);
267
- }
268
- }
269
- }
270
- av_packet_unref(pkt);
271
- }
272
-
273
- // Flush encoder
274
- avcodec_send_frame(encCtx, nullptr);
275
- AVPacket* outPkt = av_packet_alloc();
276
- while (avcodec_receive_packet(encCtx, outPkt) == 0) {
277
- outPkt->stream_index = outStream->index;
278
- av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
279
- av_interleaved_write_frame(outFmt, outPkt);
280
- av_packet_unref(outPkt);
281
- }
282
- av_packet_free(&outPkt);
283
-
284
- av_write_trailer(outFmt);
285
- if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
30
+ // Forward declaration: convertToFormat handles all formats, including WAV (16 kHz mono).
31
+ static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz);
286
32
 
287
- av_packet_free(&pkt);
288
- av_frame_free(&frame);
289
- av_channel_layout_uninit(&resampled->ch_layout);
290
- av_frame_free(&resampled);
291
-
292
- swr_free(&swr);
293
- avcodec_free_context(&encCtx);
294
- avformat_free_context(outFmt);
295
- avcodec_free_context(&decCtx);
296
- avformat_close_input(&inFmt);
297
-
298
- return std::string("");
299
- #else
300
- (void)inputPath;
301
- (void)outputPath;
302
- return "FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.";
303
- #endif
33
+ // Convenience: convert any audio to 16 kHz mono WAV via the main convertToFormat pipeline.
34
+ static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
35
+ return convertToFormat(inputPath, outputPath, "wav", 16000);
304
36
  }
305
37
 
306
38
  // Generic conversion: supports writing WAV/MP3/FLAC depending on output file extension and linked encoders.
307
- // WAV path always uses convertToWav16kMono (16 kHz mono out for sherpa-onnx). outputSampleRateHz is only used for MP3 (libshine: 32000/44100/48000); 0 = default 44100.
39
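+ // WAV output is 16 kHz mono PCM (sherpa-onnx). outputSampleRateHz is only used for MP3 (libshine: 32000/44100/48000; any other value, including 0, defaults to 44100) and Opus (8000/12000/16000/24000/48000; any other value defaults to 48000).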
308
40
  static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz) {
309
41
  #ifdef HAVE_FFMPEG
310
- // WAV output is always 16 kHz mono via convertToWav16kMono (sherpa-onnx). Input WAV at 16k is resampled 16k->16k (no change).
311
42
  std::string fmt(formatHint ? formatHint : "");
312
- if (fmt == "wav" || fmt == "wav16k") {
313
- return convertToWav16kMono(inputPath, outputPath);
314
- }
43
+ bool isWav = (fmt == "wav" || fmt == "wav16k" || fmt.empty());
315
44
 
316
- // Try to determine codec id from format hint
317
45
  AVCodecID codec_id = AV_CODEC_ID_NONE;
318
- if (fmt == "mp3") codec_id = AV_CODEC_ID_MP3;
46
+ if (isWav) codec_id = AV_CODEC_ID_PCM_S16LE;
47
+ else if (fmt == "mp3") codec_id = AV_CODEC_ID_MP3;
319
48
  else if (fmt == "flac") codec_id = AV_CODEC_ID_FLAC;
320
- else {
321
- // fallback to WAV
322
- return convertToWav16kMono(inputPath, outputPath);
323
- }
49
+ else if (fmt == "m4a" || fmt == "aac") codec_id = AV_CODEC_ID_AAC;
50
+ else if (fmt == "opus" || fmt == "oggm" || fmt == "ogg" || fmt == "webm" || fmt == "mkv") codec_id = AV_CODEC_ID_OPUS;
51
+ else codec_id = AV_CODEC_ID_PCM_S16LE;
324
52
 
325
53
  // The implementation for generic encoding uses the same decode+resample pipeline
326
54
  // but selects encoder by codec_id and creates an output container based on file extension.
327
55
  // For brevity we reuse much of the WAV path but change encoder selection.
328
56
 
57
+ struct stat stIn = {};
58
+ long inputSizeBytes = (stat(inputPath, &stIn) == 0 && S_ISREG(stIn.st_mode)) ? (long)stIn.st_size : -1;
59
+ LOGI("convertToFormat: inputPath=%s inputSizeBytes=%ld format=%s outputPath=%s", inputPath ? inputPath : "(null)", inputSizeBytes, formatHint ? formatHint : "", outputPath ? outputPath : "(null)");
60
+
329
61
  // Open input
330
62
  AVFormatContext* inFmt = nullptr;
331
63
  if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
64
+ LOGE("Failed to open input file (generic): inputPath=%s", inputPath ? inputPath : "(null)");
332
65
  return std::string("Failed to open input file");
333
66
  }
334
67
  if (avformat_find_stream_info(inFmt, nullptr) < 0) {
@@ -394,6 +127,15 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
394
127
  avformat_close_input(&inFmt);
395
128
  return std::string("libshine encoder not available in this build");
396
129
  }
130
+ } else if (codec_id == AV_CODEC_ID_OPUS) {
131
+ encoder = avcodec_find_encoder_by_name("libopus");
132
+ if (!encoder) {
133
+ avformat_free_context(outFmt);
134
+ swr_free(&swr);
135
+ avcodec_free_context(&decCtx);
136
+ avformat_close_input(&inFmt);
137
+ return std::string("libopus encoder not available in this build");
138
+ }
397
139
  } else {
398
140
  encoder = avcodec_find_encoder(codec_id);
399
141
  if (!encoder) {
@@ -461,6 +203,15 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
461
203
  // Set sample rate from input/decoder if not already set
462
204
  encCtx->sample_rate = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
463
205
 
206
+ // WAV output: force 16 kHz mono S16 (sherpa-onnx STT requirement)
207
+ if (isWav) {
208
+ encCtx->sample_rate = 16000;
209
+ encCtx->sample_fmt = AV_SAMPLE_FMT_S16;
210
+ av_channel_layout_uninit(&encCtx->ch_layout);
211
+ AVChannelLayout mono = AV_CHANNEL_LAYOUT_MONO;
212
+ av_channel_layout_copy(&encCtx->ch_layout, &mono);
213
+ }
214
+
464
215
  // Probe encoder-supported configurations (sample formats, sample rates, channel layouts)
465
216
  AVSampleFormat chosen_fmt = AV_SAMPLE_FMT_NONE;
466
217
  const void *fmt_configs = nullptr;
@@ -475,13 +226,8 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
475
226
  int chl_num = 0;
476
227
  avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_CHANNEL_LAYOUT, 0, &chl_configs, &chl_num);
477
228
 
478
- // Log supported sample formats
479
229
  if (fmt_configs && fmt_num > 0) {
480
230
  const AVSampleFormat *fmts = (const AVSampleFormat *)fmt_configs;
481
- for (int i = 0; i < fmt_num; ++i) {
482
- const char *name = av_get_sample_fmt_name(fmts[i]);
483
- LOGI("encoder supported fmt[%d]=%s", i, name ? name : "?");
484
- }
485
231
  // prefer interleaved S16, then planar S16P, then decoder fmt, then first
486
232
  for (int i = 0; i < fmt_num; ++i) if (fmts[i] == AV_SAMPLE_FMT_S16) { chosen_fmt = AV_SAMPLE_FMT_S16; break; }
487
233
  if (chosen_fmt == AV_SAMPLE_FMT_NONE && codec_id == AV_CODEC_ID_MP3) {
@@ -492,7 +238,8 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
492
238
  }
493
239
  if (chosen_fmt == AV_SAMPLE_FMT_NONE && fmt_num > 0) chosen_fmt = fmts[0];
494
240
  } else {
495
- // libshine only supports S16P; default to S16P for MP3 so open succeeds
241
+ // libshine only supports S16P; default to S16P for MP3 so open succeeds.
242
+ // If AAC, it might prefer FLTP, which `chosen_fmt = fmts[0]` captures above if available.
496
243
  chosen_fmt = (codec_id == AV_CODEC_ID_MP3) ? AV_SAMPLE_FMT_S16P : AV_SAMPLE_FMT_S16;
497
244
  }
498
245
  encCtx->sample_fmt = chosen_fmt;
@@ -502,7 +249,6 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
502
249
  const int *srs = (const int*)sr_configs;
503
250
  int pick_sr = 0;
504
251
  for (int i = 0; i < sr_num; ++i) {
505
- LOGI("encoder supported sample_rate[%d]=%d", i, srs[i]);
506
252
  if (srs[i] == encCtx->sample_rate) { pick_sr = srs[i]; break; }
507
253
  }
508
254
  if (pick_sr == 0) pick_sr = srs[0];
@@ -511,10 +257,11 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
511
257
  // libshine only supports 32000, 44100, 48000 Hz. Use outputSampleRateHz if valid (32000/44100/48000), else default 44100.
512
258
  if (codec_id == AV_CODEC_ID_MP3) {
513
259
  int want = (outputSampleRateHz == 32000 || outputSampleRateHz == 44100 || outputSampleRateHz == 48000) ? outputSampleRateHz : 44100;
514
- if (encCtx->sample_rate != want) {
515
- LOGI("libshine: setting sample_rate %d (requested %d)", want, outputSampleRateHz);
516
- encCtx->sample_rate = want;
517
- }
260
+ if (encCtx->sample_rate != want) encCtx->sample_rate = want;
261
+ }
262
+ if (codec_id == AV_CODEC_ID_OPUS) {
263
+ int want = (outputSampleRateHz == 8000 || outputSampleRateHz == 12000 || outputSampleRateHz == 16000 || outputSampleRateHz == 24000 || outputSampleRateHz == 48000) ? outputSampleRateHz : 48000;
264
+ if (encCtx->sample_rate != want) encCtx->sample_rate = want;
518
265
  }
519
266
 
520
267
  // If supported channel layouts given, prefer matching channels else pick first
@@ -523,9 +270,6 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
523
270
  int pick_nb = 0;
524
271
  for (int i = 0; i < chl_num; ++i) {
525
272
  const AVChannelLayout *l = &layouts[i];
526
- char buf[128];
527
- av_channel_layout_describe(l, buf, sizeof(buf));
528
- LOGI("encoder supported ch_layout[%d]=%s nb_channels=%d", i, buf, l->nb_channels);
529
273
  if (l->nb_channels == encCtx->ch_layout.nb_channels) { pick_nb = l->nb_channels; break; }
530
274
  }
531
275
  if (pick_nb == 0) pick_nb = layouts[0].nb_channels > 0 ? layouts[0].nb_channels : 1;
@@ -548,7 +292,7 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
548
292
  }
549
293
 
550
294
  // Set a sensible default bitrate for compressed codecs
551
- if (codec_id == AV_CODEC_ID_MP3 || codec_id == AV_CODEC_ID_AAC) encCtx->bit_rate = 128000;
295
+ if (codec_id == AV_CODEC_ID_MP3 || codec_id == AV_CODEC_ID_AAC || codec_id == AV_CODEC_ID_OPUS) encCtx->bit_rate = 128000;
552
296
  else encCtx->bit_rate = 0; // lossless or PCM may ignore
553
297
 
554
298
  if (outFmt->oformat->flags & AVFMT_GLOBALHEADER) encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
@@ -614,10 +358,10 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
614
358
  }
615
359
  }
616
360
 
617
- // Last resort: try S16 then S16P (for FLAC etc.)
361
+ // Last resort: try S16, S16P, then FLTP (for AAC etc.)
618
362
  if (ret < 0) {
619
- AVSampleFormat fallbacks[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P };
620
- for (int fi = 0; fi < 2 && ret < 0; ++fi) {
363
+ AVSampleFormat fallbacks[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLTP };
364
+ for (int fi = 0; fi < 3 && ret < 0; ++fi) {
621
365
  encCtx->sample_fmt = fallbacks[fi];
622
366
  AVDictionary *try_opts = nullptr;
623
367
  snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->ch_layout.nb_channels > 0 ? encCtx->ch_layout.nb_channels : 1);
@@ -742,73 +486,203 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
742
486
  avformat_close_input(&inFmt);
743
487
  return std::string("Failed to initialize resampler");
744
488
  }
489
+ {
490
+ int initRet = swr_init(swr);
491
+ if (initRet < 0) {
492
+ char errbuf[256];
493
+ av_strerror(initRet, errbuf, sizeof(errbuf));
494
+ LOGE("convertToFormat: swr_init failed: %s", errbuf);
495
+ av_channel_layout_uninit(&in_ch_layout2);
496
+ swr_free(&swr);
497
+ av_channel_layout_uninit(&resampled->ch_layout);
498
+ av_frame_free(&frame);
499
+ av_frame_free(&resampled);
500
+ av_packet_free(&pkt);
501
+ avcodec_free_context(&encCtx);
502
+ avformat_free_context(outFmt);
503
+ avcodec_free_context(&decCtx);
504
+ avformat_close_input(&inFmt);
505
+ return std::string("Failed to initialize resampler (swr_init)");
506
+ }
507
+ }
745
508
  av_channel_layout_uninit(&in_ch_layout2);
746
509
 
510
+ int totalDecodedFrames = 0;
511
+ int totalFramesSent = 0;
512
+ int totalPacketsFromEncoder = 0;
513
+ int flushPackets = 0;
514
+ int64_t encoder_pts = 0;
515
+
516
+ // Many encoders prefer / require a specific frame size (nb_samples) when using send_frame().
517
+ // MP3 (libshine) requires 1152 samples per frame.
518
+ // For others (e.g. FLAC), use encCtx->frame_size when available; otherwise use a conservative default.
519
+ const int default_frame_size = 1024;
520
+ const int enc_frame_size =
521
+ (codec_id == AV_CODEC_ID_MP3) ? 1152 :
522
+ (encCtx->frame_size > 0 ? encCtx->frame_size : default_frame_size);
523
+ int out_ch2 = encCtx->ch_layout.nb_channels;
524
+ if (out_ch2 <= 0) out_ch2 = 1;
525
+ int bytes_per_sample = av_get_bytes_per_sample(encCtx->sample_fmt);
526
+
527
+ // Accumulation buffer for resampled samples. Use read offset to avoid O(n²) memmove;
528
+ // compact only when offset exceeds threshold.
529
+ std::vector<uint8_t> accumBuf;
530
+ size_t accumReadOffset = 0; // bytes consumed from start (avoids O(n²) memmove)
531
+ const int bytesPerFrame = bytes_per_sample * out_ch2;
532
+ int accumSamples = 0;
533
+
534
+ const size_t kCompactThreshold = 256 * 1024; // compact when read offset exceeds 256 KB
535
+
536
+ auto maybeCompact = [&]() {
537
+ if (accumReadOffset == 0) return;
538
+ if (accumReadOffset < kCompactThreshold && accumReadOffset * 2 < accumBuf.size()) return;
539
+ size_t valid = accumBuf.size() - accumReadOffset;
540
+ if (valid > 0) memmove(accumBuf.data(), accumBuf.data() + accumReadOffset, valid);
541
+ accumBuf.resize(valid);
542
+ accumReadOffset = 0;
543
+ };
544
+
545
+ // Helper lambda: send enc_frame_size-sample frames from accumBuf to the encoder; when sendPartial is true, also send a final shorter frame.
546
+ auto flushAccumFrames = [&](bool sendPartial) {
547
+ int needed = enc_frame_size;
548
+ if (needed <= 0) return;
549
+
550
+ while (accumSamples >= needed || (sendPartial && accumSamples > 0)) {
551
+ int toSend = (accumSamples >= needed) ? needed : accumSamples;
552
+ AVFrame* ef = av_frame_alloc();
553
+ if (!ef) break;
554
+ ef->format = encCtx->sample_fmt;
555
+ ef->sample_rate = encCtx->sample_rate;
556
+ if (av_channel_layout_copy(&ef->ch_layout, &encCtx->ch_layout) < 0) { av_frame_free(&ef); break; }
557
+ ef->nb_samples = toSend;
558
+ if (av_frame_get_buffer(ef, 0) < 0) { av_channel_layout_uninit(&ef->ch_layout); av_frame_free(&ef); break; }
559
+ int copyBytes = toSend * bytesPerFrame;
560
+ memcpy(ef->data[0], accumBuf.data() + accumReadOffset, copyBytes);
561
+ ef->pts = encoder_pts;
562
+ encoder_pts += toSend;
563
+
564
+ accumReadOffset += (size_t)copyBytes;
565
+ accumSamples -= toSend;
566
+
567
+ // Send to encoder with EAGAIN handling
568
+ for (;;) {
569
+ int ret = avcodec_send_frame(encCtx, ef);
570
+ if (ret == 0) break;
571
+ if (ret == AVERROR(EAGAIN)) {
572
+ AVPacket* op = av_packet_alloc();
573
+ while (avcodec_receive_packet(encCtx, op) == 0) {
574
+ op->stream_index = outStream->index;
575
+ av_packet_rescale_ts(op, encCtx->time_base, outStream->time_base);
576
+ av_interleaved_write_frame(outFmt, op);
577
+ av_packet_unref(op);
578
+ totalPacketsFromEncoder++;
579
+ }
580
+ av_packet_free(&op);
581
+ continue;
582
+ }
583
+ LOGW("convertToFormat: send_frame ret=%d frame=%d pts=%lld nb=%d", ret, totalFramesSent, (long long)ef->pts, toSend);
584
+ break;
585
+ }
586
+ // Drain any ready packets
587
+ AVPacket* op = av_packet_alloc();
588
+ while (avcodec_receive_packet(encCtx, op) == 0) {
589
+ op->stream_index = outStream->index;
590
+ av_packet_rescale_ts(op, encCtx->time_base, outStream->time_base);
591
+ av_interleaved_write_frame(outFmt, op);
592
+ av_packet_unref(op);
593
+ totalPacketsFromEncoder++;
594
+ }
595
+ av_packet_free(&op);
596
+
597
+ av_channel_layout_uninit(&ef->ch_layout);
598
+ av_frame_free(&ef);
599
+ totalFramesSent++;
600
+
601
+ if (!sendPartial && accumSamples < needed) break;
602
+ }
603
+ };
604
+
747
605
  while (av_read_frame(inFmt, pkt) >= 0) {
748
606
  if (pkt->stream_index == audioStreamIndex) {
749
607
  if (avcodec_send_packet(decCtx, pkt) == 0) {
750
608
  while (avcodec_receive_frame(decCtx, frame) == 0) {
609
+ totalDecodedFrames++;
751
610
  int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
752
611
  int64_t out_nb_samples = av_rescale_rnd(swr_get_delay(swr, in_sr2) + frame->nb_samples, encCtx->sample_rate, in_sr2, AV_ROUND_UP);
753
612
  uint8_t** outData = nullptr;
754
- int out_ch2 = encCtx->ch_layout.nb_channels;
755
- if (out_ch2 <= 0) out_ch2 = 1;
756
613
  if (av_samples_alloc_array_and_samples(&outData, nullptr, out_ch2, (int)out_nb_samples, encCtx->sample_fmt, 0) < 0) {
757
614
  av_packet_unref(pkt);
758
615
  continue;
759
616
  }
760
617
  int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
761
- if (converted < 0) {
618
+ if (converted <= 0) {
762
619
  av_freep(&outData[0]);
763
620
  av_freep(&outData);
764
621
  continue;
765
622
  }
766
623
 
767
- resampled->nb_samples = converted;
768
- if (av_frame_get_buffer(resampled, 0) < 0) {
769
- av_freep(&outData[0]);
770
- av_freep(&outData);
771
- continue;
772
- }
773
- int bytes_per_sample = av_get_bytes_per_sample((AVSampleFormat)resampled->format);
774
- int copy_size2 = converted * bytes_per_sample * out_ch2;
775
- memcpy(resampled->data[0], outData[0], copy_size2);
776
-
777
- if (avcodec_send_frame(encCtx, resampled) == 0) {
778
- AVPacket* outPkt = av_packet_alloc();
779
- while (avcodec_receive_packet(encCtx, outPkt) == 0) {
780
- outPkt->stream_index = outStream->index;
781
- av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
782
- av_interleaved_write_frame(outFmt, outPkt);
783
- av_packet_unref(outPkt);
784
- }
785
- av_packet_free(&outPkt);
786
- }
624
+
625
+ int newBytes = converted * bytes_per_sample * out_ch2;
626
+ maybeCompact();
627
+ size_t oldSize = accumBuf.size();
628
+ accumBuf.resize(oldSize + (size_t)newBytes);
629
+ memcpy(accumBuf.data() + oldSize, outData[0], (size_t)newBytes);
630
+ accumSamples += converted;
787
631
 
788
632
  av_freep(&outData[0]);
789
633
  av_freep(&outData);
790
- av_frame_unref(resampled);
791
634
  av_frame_unref(frame);
635
+
636
+ flushAccumFrames(false);
792
637
  }
793
638
  }
794
639
  }
795
640
  av_packet_unref(pkt);
796
641
  }
797
642
 
643
+ // Drain any remaining samples in swr (resampler delay)
644
+ {
645
+ uint8_t** tailData = nullptr;
646
+ int tailCap = swr_get_delay(swr, encCtx->sample_rate) + 256;
647
+ if (tailCap > 0 && av_samples_alloc_array_and_samples(&tailData, nullptr, out_ch2, tailCap, encCtx->sample_fmt, 0) >= 0) {
648
+ int tailConverted = swr_convert(swr, tailData, tailCap, nullptr, 0);
649
+ if (tailConverted > 0) {
650
+ int tailBytes = tailConverted * bytes_per_sample * out_ch2;
651
+ maybeCompact();
652
+ size_t oldSize = accumBuf.size();
653
+ accumBuf.resize(oldSize + (size_t)tailBytes);
654
+ memcpy(accumBuf.data() + oldSize, tailData[0], (size_t)tailBytes);
655
+ accumSamples += tailConverted;
656
+ }
657
+ av_freep(&tailData[0]);
658
+ av_freep(&tailData);
659
+ }
660
+ }
661
+ // Send remaining (partial) frames
662
+ flushAccumFrames(true);
663
+
664
+ (void)totalDecodedFrames; (void)totalPacketsFromEncoder;
665
+
798
666
  // Flush encoder
799
667
  avcodec_send_frame(encCtx, nullptr);
800
668
  AVPacket* outPkt2 = av_packet_alloc();
801
669
  while (avcodec_receive_packet(encCtx, outPkt2) == 0) {
670
+ flushPackets++;
802
671
  outPkt2->stream_index = outStream->index;
803
672
  av_packet_rescale_ts(outPkt2, encCtx->time_base, outStream->time_base);
804
673
  av_interleaved_write_frame(outFmt, outPkt2);
805
674
  av_packet_unref(outPkt2);
806
675
  }
807
676
  av_packet_free(&outPkt2);
677
+ (void)flushPackets;
808
678
 
809
679
  av_write_trailer(outFmt);
810
680
  if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
811
681
 
682
+ struct stat stOut = {};
683
+ long outputSizeBytes = (stat(outputPath, &stOut) == 0 && S_ISREG(stOut.st_mode)) ? (long)stOut.st_size : -1;
684
+ LOGI("convertToFormat: done outputPath=%s outputSizeBytes=%ld", outputPath ? outputPath : "(null)", outputSizeBytes);
685
+
812
686
  av_packet_free(&pkt);
813
687
  av_frame_free(&frame);
814
688
  av_channel_layout_uninit(&resampled->ch_layout);