npm - react-native-sherpa-onnx - Versions diffs - 0.3.2 → 0.3.4 - Mend

react-native-sherpa-onnx 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83)

package/README.md +84 -77
package/SherpaOnnx.podspec +79 -45
package/android/build.gradle +8 -2
package/android/prebuilt-download.gradle +70 -16
package/android/prebuilt-versions.gradle +14 -6
package/android/src/main/cpp/CMakeLists.txt +2 -0
package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +202 -328
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.cpp +22 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-detect-jni-common.h +2 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.cpp +96 -142
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-helper.h +40 -4
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-stt.cpp +774 -316
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +208 -122
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +92 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-stt-wrapper.cpp +3 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-tts-wrapper.cpp +14 -2
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.cpp +229 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-stt.h +38 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +144 -0
package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.h +38 -0
package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +1 -1
package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +157 -11
package/android/src/main/java/com/sherpaonnx/SherpaOnnxPcmCapture.kt +150 -0
package/android/src/main/java/com/sherpaonnx/SherpaOnnxSttHelper.kt +75 -24
package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +52 -1
package/ios/SherpaOnnx+PcmLiveStream.mm +288 -0
package/ios/SherpaOnnx+STT.mm +2 -0
package/ios/SherpaOnnx+TTS.mm +17 -0
package/ios/SherpaOnnx.mm +27 -3
package/ios/SherpaOnnxAudioConvert.h +28 -0
package/ios/SherpaOnnxAudioConvert.mm +698 -0
package/ios/archive/sherpa-onnx-archive-helper.mm +12 -0
package/ios/model_detect/sherpa-onnx-model-detect-helper.h +37 -3
package/ios/model_detect/sherpa-onnx-model-detect-helper.mm +80 -45
package/ios/model_detect/sherpa-onnx-model-detect-stt.mm +629 -267
package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +148 -56
package/ios/model_detect/sherpa-onnx-model-detect.h +72 -0
package/ios/model_detect/sherpa-onnx-validate-stt.h +38 -0
package/ios/model_detect/sherpa-onnx-validate-stt.mm +229 -0
package/ios/model_detect/sherpa-onnx-validate-tts.h +38 -0
package/ios/model_detect/sherpa-onnx-validate-tts.mm +144 -0
package/ios/stt/sherpa-onnx-stt-wrapper.mm +4 -0
package/lib/module/NativeSherpaOnnx.js.map +1 -1
package/lib/module/audio/index.js +55 -1
package/lib/module/audio/index.js.map +1 -1
package/lib/module/download/ModelDownloadManager.js +14 -0
package/lib/module/download/ModelDownloadManager.js.map +1 -1
package/lib/module/index.js +10 -0
package/lib/module/index.js.map +1 -1
package/lib/module/stt/streaming.js +6 -3
package/lib/module/stt/streaming.js.map +1 -1
package/lib/module/tts/index.js +13 -1
package/lib/module/tts/index.js.map +1 -1
package/lib/typescript/src/NativeSherpaOnnx.d.ts +32 -3
package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
package/lib/typescript/src/audio/index.d.ts +20 -1
package/lib/typescript/src/audio/index.d.ts.map +1 -1
package/lib/typescript/src/download/ModelDownloadManager.d.ts +2 -1
package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
package/lib/typescript/src/index.d.ts +10 -0
package/lib/typescript/src/index.d.ts.map +1 -1
package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
package/lib/typescript/src/stt/streamingTypes.d.ts +1 -1
package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
package/lib/typescript/src/tts/index.d.ts +12 -1
package/lib/typescript/src/tts/index.d.ts.map +1 -1
package/package.json +6 -1
package/scripts/check-model-csvs.sh +72 -0
package/scripts/setup-ios-framework.sh +272 -191
package/src/NativeSherpaOnnx.ts +37 -3
package/src/audio/index.ts +84 -1
package/src/download/ModelDownloadManager.ts +19 -0
package/src/index.tsx +15 -0
package/src/stt/streaming.ts +10 -5
package/src/stt/streamingTypes.ts +1 -1
package/src/tts/index.ts +25 -1
package/third_party/ffmpeg_prebuilt/ANDROID_RELEASE_TAG +1 -1
package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
package/ios/scripts/patch-libarchive-includes.sh +0 -61
package/ios/scripts/setup-ios-libarchive.sh +0 -98

package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp CHANGED

@@ -8,6 +8,7 @@
 #include <android/log.h>
 #include <jni.h>
 #include <string>
+#include <sys/stat.h>
 #define LOG_TAG "AudioConvertJNI"
 #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
@@ -19,316 +20,48 @@ extern "C" {
 #include <libavcodec/avcodec.h>
 #include <libavformat/avformat.h>
 #include <libavutil/opt.h>
+#include <libavutil/error.h>
 #include <libswresample/swresample.h>
 }
 #include <cstdio>
 #include <vector>
 #endif
-// Returns empty string on success, or error message on failure.
-// Output is always 16 kHz mono 16-bit PCM (sherpa-onnx requirement). Input can be any rate; we resample to 16k.
-static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
-#ifdef HAVE_FFMPEG
-    // Implement a basic decode -> resample -> write WAV pipeline using libav* APIs.
-    av_log_set_level(AV_LOG_ERROR);
-    AVFormatContext* inFmt = nullptr;
-    if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
-        return std::string("Failed to open input file");
-    }
-    if (avformat_find_stream_info(inFmt, nullptr) < 0) {
-        avformat_close_input(&inFmt);
-        return std::string("Failed to find stream info");
-    }
-    int audioStreamIndex = -1;
-    for (unsigned i = 0; i < inFmt->nb_streams; ++i) {
-        if (inFmt->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
-            audioStreamIndex = i;
-            break;
-        }
-    }
-    if (audioStreamIndex < 0) {
-        avformat_close_input(&inFmt);
-        return std::string("No audio stream found in input");
-    }
-    AVStream* inStream = inFmt->streams[audioStreamIndex];
-    const AVCodec* decoder = avcodec_find_decoder(inStream->codecpar->codec_id);
-    if (!decoder) {
-        avformat_close_input(&inFmt);
-        return std::string("Unsupported input codec");
-    }
-    AVCodecContext* decCtx = avcodec_alloc_context3(decoder);
-    if (!decCtx) {
-        avformat_close_input(&inFmt);
-        return std::string("Failed to allocate decoder context");
-    }
-    if (avcodec_parameters_to_context(decCtx, inStream->codecpar) < 0) {
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to copy codec parameters");
-    }
-    if (avcodec_open2(decCtx, decoder, nullptr) < 0) {
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to open decoder");
-    }
-    // Prepare resampler to 16k mono s16 using AVChannelLayout helpers
-    SwrContext* swr = nullptr;
-    AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO;
-    AVChannelLayout in_ch_layout;
-    // Prefer codecpar ch_layout when available, otherwise fall back to decoder ctx
-    if (inStream->codecpar->ch_layout.nb_channels) {
-        if (av_channel_layout_copy(&in_ch_layout, &inStream->codecpar->ch_layout) < 0) {
-            avcodec_free_context(&decCtx);
-            avformat_close_input(&inFmt);
-            return std::string("Failed to copy input channel layout");
-        }
-    } else {
-        if (av_channel_layout_copy(&in_ch_layout, &decCtx->ch_layout) < 0) {
-            avcodec_free_context(&decCtx);
-            avformat_close_input(&inFmt);
-            return std::string("Failed to initialize input channel layout");
-        }
-    }
-    if (swr_alloc_set_opts2(&swr,
-            &out_ch_layout, AV_SAMPLE_FMT_S16, 16000,
-            &in_ch_layout, (AVSampleFormat)decCtx->sample_fmt, decCtx->sample_rate,
-            0, nullptr) < 0 || !swr) {
-        av_channel_layout_uninit(&in_ch_layout);
-        if (swr) swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to initialize resampler");
-    }
-    av_channel_layout_uninit(&in_ch_layout);
-    // Prepare output WAV via avformat
-    AVFormatContext* outFmt = nullptr;
-    if (avformat_alloc_output_context2(&outFmt, nullptr, nullptr, outputPath) < 0 || !outFmt) {
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to allocate output context");
-    }
-    const AVCodec* pcmCodec = avcodec_find_encoder(AV_CODEC_ID_PCM_S16LE);
-    if (!pcmCodec) {
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("PCM encoder not found");
-    }
-    AVStream* outStream = avformat_new_stream(outFmt, nullptr);
-    if (!outStream) {
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to create output stream");
-    }
-    AVCodecContext* encCtx = avcodec_alloc_context3(pcmCodec);
-    // Configure encoder context for mono 16k s16 output
-    AVChannelLayout mono_layout = AV_CHANNEL_LAYOUT_MONO;
-    if (!encCtx) {
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to allocate encoder context");
-    }
-    if (av_channel_layout_copy(&encCtx->ch_layout, &mono_layout) < 0) {
-        avcodec_free_context(&encCtx);
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to set encoder channel layout");
-    }
-    encCtx->sample_rate = 16000;
-    encCtx->sample_fmt = AV_SAMPLE_FMT_S16;
-    encCtx->bit_rate = 16 * 16000; // rough
-    if (outFmt->oformat->flags & AVFMT_GLOBALHEADER) encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
-    if (avcodec_open2(encCtx, pcmCodec, nullptr) < 0) {
-        avcodec_free_context(&encCtx);
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to open PCM encoder");
-    }
-    if (avcodec_parameters_from_context(outStream->codecpar, encCtx) < 0) {
-        avcodec_free_context(&encCtx);
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to set output stream parameters");
-    }
-    if (!(outFmt->oformat->flags & AVFMT_NOFILE)) {
-        if (avio_open(&outFmt->pb, outputPath, AVIO_FLAG_WRITE) < 0) {
-            avcodec_free_context(&encCtx);
-            avformat_free_context(outFmt);
-            swr_free(&swr);
-            avcodec_free_context(&decCtx);
-            avformat_close_input(&inFmt);
-            return std::string("Failed to open output file for writing");
-        }
-    }
-    if (avformat_write_header(outFmt, nullptr) < 0) {
-        if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
-        avcodec_free_context(&encCtx);
-        avformat_free_context(outFmt);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to write output header");
-    }
-    AVPacket* pkt = av_packet_alloc();
-    AVFrame* frame = av_frame_alloc();
-    AVFrame* resampled = av_frame_alloc();
-    // Configure resampled frame metadata
-    resampled->format = AV_SAMPLE_FMT_S16;
-    resampled->sample_rate = 16000;
-    // set channel layout on frame
-    AVChannelLayout out_ch_layout_local = AV_CHANNEL_LAYOUT_MONO;
-    if (av_channel_layout_copy(&resampled->ch_layout, &out_ch_layout_local) < 0) {
-        av_frame_free(&frame);
-        av_frame_free(&resampled);
-        swr_free(&swr);
-        avcodec_free_context(&decCtx);
-        avformat_close_input(&inFmt);
-        return std::string("Failed to set resampled frame channel layout");
-    }
-    // Buffer for resampled data will be allocated per needed samples
-    while (av_read_frame(inFmt, pkt) >= 0) {
-        if (pkt->stream_index == audioStreamIndex) {
-            if (avcodec_send_packet(decCtx, pkt) == 0) {
-                while (avcodec_receive_frame(decCtx, frame) == 0) {
-                    // Resample
-                    int in_sr = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
-                    int64_t out_nb_samples = av_rescale_rnd(swr_get_delay(swr, in_sr) + frame->nb_samples, 16000, in_sr, AV_ROUND_UP);
-                    uint8_t** outData = nullptr;
-                    int out_channels = resampled->ch_layout.nb_channels;
-                    if (out_channels <= 0) out_channels = 1;
-                    if (av_samples_alloc_array_and_samples(&outData, nullptr, out_channels, (int)out_nb_samples, AV_SAMPLE_FMT_S16, 0) < 0) {
-                        av_packet_unref(pkt);
-                        continue;
-                    }
-                    int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
-                    if (converted < 0) {
-                        av_freep(&outData[0]);
-                        av_freep(&outData);
-                        continue;
-                    }
-                    // prepare frame for encoder
-                    resampled->nb_samples = converted;
-                    if (av_frame_get_buffer(resampled, 0) < 0) {
-                        av_freep(&outData[0]);
-                        av_freep(&outData);
-                        continue;
-                    }
-                    // copy data into resampled frame
-                    int bytes_per_sample = av_get_bytes_per_sample((AVSampleFormat)resampled->format);
-                    int copy_size = converted * bytes_per_sample * out_channels;
-                    memcpy(resampled->data[0], outData[0], copy_size);
-                    // send to encoder
-                    if (avcodec_send_frame(encCtx, resampled) == 0) {
-                        AVPacket* outPkt = av_packet_alloc();
-                        while (avcodec_receive_packet(encCtx, outPkt) == 0) {
-                            outPkt->stream_index = outStream->index;
-                            av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
-                            av_interleaved_write_frame(outFmt, outPkt);
-                            av_packet_unref(outPkt);
-                        }
-                        av_packet_free(&outPkt);
-                    }
-                    av_freep(&outData[0]);
-                    av_freep(&outData);
-                    av_frame_unref(resampled);
-                    av_frame_unref(frame);
-                }
-            }
-        }
-        av_packet_unref(pkt);
-    }
-    // Flush encoder
-    avcodec_send_frame(encCtx, nullptr);
-    AVPacket* outPkt = av_packet_alloc();
-    while (avcodec_receive_packet(encCtx, outPkt) == 0) {
-        outPkt->stream_index = outStream->index;
-        av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
-        av_interleaved_write_frame(outFmt, outPkt);
-        av_packet_unref(outPkt);
-    }
-    av_packet_free(&outPkt);
-    av_write_trailer(outFmt);
-    if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
+// Forward declaration — convertToFormat handles all formats including WAV (16 kHz mono).
+static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz);
-    av_packet_free(&pkt);
-    av_frame_free(&frame);
-    av_channel_layout_uninit(&resampled->ch_layout);
-    av_frame_free(&resampled);
-    swr_free(&swr);
-    avcodec_free_context(&encCtx);
-    avformat_free_context(outFmt);
-    avcodec_free_context(&decCtx);
-    avformat_close_input(&inFmt);
-    return std::string("");
-#else
-    (void)inputPath;
-    (void)outputPath;
-    return "FFmpeg not available. Build prebuilts with third_party/ffmpeg_prebuilt/build_ffmpeg.ps1 or build_ffmpeg.sh.";
-#endif
+// Convenience: convert any audio to 16 kHz mono WAV via the main convertToFormat pipeline.
+static std::string convertToWav16kMono(const char* inputPath, const char* outputPath) {
+    return convertToFormat(inputPath, outputPath, "wav", 16000);
 }
 // Generic conversion: supports writing WAV/MP3/FLAC depending on output file extension and linked encoders.
-// WAV path always uses convertToWav16kMono (16 kHz mono out for sherpa-onnx). outputSampleRateHz is only used for MP3 (libshine: 32000/44100/48000); 0 = default 44100.
+// WAV output is 16 kHz mono PCM (sherpa-onnx). outputSampleRateHz is honored for MP3 (libshine: 32000/44100/48000; any other value = default 44100) and for Opus (8000/12000/16000/24000/48000; any other value = default 48000).
 static std::string convertToFormat(const char* inputPath, const char* outputPath, const char* formatHint, int outputSampleRateHz) {
 #ifdef HAVE_FFMPEG
-    // WAV output is always 16 kHz mono via convertToWav16kMono (sherpa-onnx). Input WAV at 16k is resampled 16k->16k (no change).
     std::string fmt(formatHint ? formatHint : "");
-    if (fmt == "wav" || fmt == "wav16k") {
-        return convertToWav16kMono(inputPath, outputPath);
-    }
+    bool isWav = (fmt == "wav" || fmt == "wav16k" || fmt.empty());
-    // Try to determine codec id from format hint
     AVCodecID codec_id = AV_CODEC_ID_NONE;
-    if (fmt == "mp3") codec_id = AV_CODEC_ID_MP3;
+    if (isWav) codec_id = AV_CODEC_ID_PCM_S16LE;
+    else if (fmt == "mp3") codec_id = AV_CODEC_ID_MP3;
     else if (fmt == "flac") codec_id = AV_CODEC_ID_FLAC;
-    else {
-        // fallback to WAV
-        return convertToWav16kMono(inputPath, outputPath);
-    }
+    else if (fmt == "m4a" || fmt == "aac") codec_id = AV_CODEC_ID_AAC;
+    else if (fmt == "opus" || fmt == "oggm" || fmt == "ogg" || fmt == "webm" || fmt == "mkv") codec_id = AV_CODEC_ID_OPUS;
+    else codec_id = AV_CODEC_ID_PCM_S16LE;
     // The implementation for generic encoding uses the same decode+resample pipeline
     // but selects encoder by codec_id and creates an output container based on file extension.
     // For brevity we reuse much of the WAV path but change encoder selection.
+    struct stat stIn = {};
+    long inputSizeBytes = (stat(inputPath, &stIn) == 0 && S_ISREG(stIn.st_mode)) ? (long)stIn.st_size : -1;
+    LOGI("convertToFormat: inputPath=%s inputSizeBytes=%ld format=%s outputPath=%s", inputPath ? inputPath : "(null)", inputSizeBytes, formatHint ? formatHint : "", outputPath ? outputPath : "(null)");
     // Open input
     AVFormatContext* inFmt = nullptr;
     if (avformat_open_input(&inFmt, inputPath, nullptr, nullptr) < 0) {
+        LOGE("Failed to open input file (generic): inputPath=%s", inputPath ? inputPath : "(null)");
         return std::string("Failed to open input file");
     }
     if (avformat_find_stream_info(inFmt, nullptr) < 0) {
@@ -394,6 +127,15 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
             avformat_close_input(&inFmt);
             return std::string("libshine encoder not available in this build");
         }
+    } else if (codec_id == AV_CODEC_ID_OPUS) {
+        encoder = avcodec_find_encoder_by_name("libopus");
+        if (!encoder) {
+            avformat_free_context(outFmt);
+            swr_free(&swr);
+            avcodec_free_context(&decCtx);
+            avformat_close_input(&inFmt);
+            return std::string("libopus encoder not available in this build");
+        }
     } else {
         encoder = avcodec_find_encoder(codec_id);
         if (!encoder) {
@@ -461,6 +203,15 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
     // Set sample rate from input/decoder if not already set
     encCtx->sample_rate = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
+    // WAV output: force 16 kHz mono S16 (sherpa-onnx STT requirement)
+    if (isWav) {
+        encCtx->sample_rate = 16000;
+        encCtx->sample_fmt = AV_SAMPLE_FMT_S16;
+        av_channel_layout_uninit(&encCtx->ch_layout);
+        AVChannelLayout mono = AV_CHANNEL_LAYOUT_MONO;
+        av_channel_layout_copy(&encCtx->ch_layout, &mono);
+    }
     // Probe encoder-supported configurations (sample formats, sample rates, channel layouts)
     AVSampleFormat chosen_fmt = AV_SAMPLE_FMT_NONE;
     const void *fmt_configs = nullptr;
@@ -475,13 +226,8 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
     int chl_num = 0;
     avcodec_get_supported_config(encCtx, encoder, AV_CODEC_CONFIG_CHANNEL_LAYOUT, 0, &chl_configs, &chl_num);
-    // Log supported sample formats
     if (fmt_configs && fmt_num > 0) {
         const AVSampleFormat *fmts = (const AVSampleFormat *)fmt_configs;
-        for (int i = 0; i < fmt_num; ++i) {
-            const char *name = av_get_sample_fmt_name(fmts[i]);
-            LOGI("encoder supported fmt[%d]=%s", i, name ? name : "?");
-        }
         // prefer interleaved S16, then planar S16P, then decoder fmt, then first
         for (int i = 0; i < fmt_num; ++i) if (fmts[i] == AV_SAMPLE_FMT_S16) { chosen_fmt = AV_SAMPLE_FMT_S16; break; }
         if (chosen_fmt == AV_SAMPLE_FMT_NONE && codec_id == AV_CODEC_ID_MP3) {
@@ -492,7 +238,8 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
         }
         if (chosen_fmt == AV_SAMPLE_FMT_NONE && fmt_num > 0) chosen_fmt = fmts[0];
     } else {
-        // libshine only supports S16P; default to S16P for MP3 so open succeeds
+        // libshine only supports S16P; default to S16P for MP3 so open succeeds.
+        // If AAC, it might prefer FLTP, which `chosen_fmt = fmts[0]` captures above if available.
         chosen_fmt = (codec_id == AV_CODEC_ID_MP3) ? AV_SAMPLE_FMT_S16P : AV_SAMPLE_FMT_S16;
     }
     encCtx->sample_fmt = chosen_fmt;
@@ -502,7 +249,6 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
         const int *srs = (const int*)sr_configs;
         int pick_sr = 0;
         for (int i = 0; i < sr_num; ++i) {
-            LOGI("encoder supported sample_rate[%d]=%d", i, srs[i]);
             if (srs[i] == encCtx->sample_rate) { pick_sr = srs[i]; break; }
         }
         if (pick_sr == 0) pick_sr = srs[0];
@@ -511,10 +257,11 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
     // libshine only supports 32000, 44100, 48000 Hz. Use outputSampleRateHz if valid (32000/44100/48000), else default 44100.
     if (codec_id == AV_CODEC_ID_MP3) {
         int want = (outputSampleRateHz == 32000 || outputSampleRateHz == 44100 || outputSampleRateHz == 48000) ? outputSampleRateHz : 44100;
-        if (encCtx->sample_rate != want) {
-            LOGI("libshine: setting sample_rate %d (requested %d)", want, outputSampleRateHz);
-            encCtx->sample_rate = want;
-        }
+        if (encCtx->sample_rate != want) encCtx->sample_rate = want;
+    }
+    if (codec_id == AV_CODEC_ID_OPUS) {
+        int want = (outputSampleRateHz == 8000 || outputSampleRateHz == 12000 || outputSampleRateHz == 16000 || outputSampleRateHz == 24000 || outputSampleRateHz == 48000) ? outputSampleRateHz : 48000;
+        if (encCtx->sample_rate != want) encCtx->sample_rate = want;
     }
     // If supported channel layouts given, prefer matching channels else pick first
@@ -523,9 +270,6 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
         int pick_nb = 0;
         for (int i = 0; i < chl_num; ++i) {
             const AVChannelLayout *l = &layouts[i];
-            char buf[128];
-            av_channel_layout_describe(l, buf, sizeof(buf));
-            LOGI("encoder supported ch_layout[%d]=%s nb_channels=%d", i, buf, l->nb_channels);
             if (l->nb_channels == encCtx->ch_layout.nb_channels) { pick_nb = l->nb_channels; break; }
         }
         if (pick_nb == 0) pick_nb = layouts[0].nb_channels > 0 ? layouts[0].nb_channels : 1;
@@ -548,7 +292,7 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
     }
     // Set a sensible default bitrate for compressed codecs
-    if (codec_id == AV_CODEC_ID_MP3 || codec_id == AV_CODEC_ID_AAC) encCtx->bit_rate = 128000;
+    if (codec_id == AV_CODEC_ID_MP3 || codec_id == AV_CODEC_ID_AAC || codec_id == AV_CODEC_ID_OPUS) encCtx->bit_rate = 128000;
     else encCtx->bit_rate = 0; // lossless or PCM may ignore
     if (outFmt->oformat->flags & AVFMT_GLOBALHEADER) encCtx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
@@ -614,10 +358,10 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
             }
         }
-        // Last resort: try S16 then S16P (for FLAC etc.)
+        // Last resort: try S16, S16P, then FLTP (for AAC etc.)
         if (ret < 0) {
-            AVSampleFormat fallbacks[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P };
-            for (int fi = 0; fi < 2 && ret < 0; ++fi) {
+            AVSampleFormat fallbacks[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_FLTP };
+            for (int fi = 0; fi < 3 && ret < 0; ++fi) {
                 encCtx->sample_fmt = fallbacks[fi];
                 AVDictionary *try_opts = nullptr;
                 snprintf(tmpbuf, sizeof(tmpbuf), "%d", encCtx->ch_layout.nb_channels > 0 ? encCtx->ch_layout.nb_channels : 1);
@@ -742,73 +486,203 @@ static std::string convertToFormat(const char* inputPath, const char* outputPath
         avformat_close_input(&inFmt);
         return std::string("Failed to initialize resampler");
     }
+    {
+        int initRet = swr_init(swr);
+        if (initRet < 0) {
+            char errbuf[256];
+            av_strerror(initRet, errbuf, sizeof(errbuf));
+            LOGE("convertToFormat: swr_init failed: %s", errbuf);
+            av_channel_layout_uninit(&in_ch_layout2);
+            swr_free(&swr);
+            av_channel_layout_uninit(&resampled->ch_layout);
+            av_frame_free(&frame);
+            av_frame_free(&resampled);
+            av_packet_free(&pkt);
+            avcodec_free_context(&encCtx);
+            avformat_free_context(outFmt);
+            avcodec_free_context(&decCtx);
+            avformat_close_input(&inFmt);
+            return std::string("Failed to initialize resampler (swr_init)");
+        }
+    }
     av_channel_layout_uninit(&in_ch_layout2);
+    int totalDecodedFrames = 0;
+    int totalFramesSent = 0;
+    int totalPacketsFromEncoder = 0;
+    int flushPackets = 0;
+    int64_t encoder_pts = 0;
+    // Many encoders prefer / require a specific frame size (nb_samples) when using send_frame().
+    // MP3 (libshine) requires 1152 samples per frame.
+    // For others (e.g. FLAC), use encCtx->frame_size when available; otherwise use a conservative default.
+    const int default_frame_size = 1024;
+    const int enc_frame_size =
+        (codec_id == AV_CODEC_ID_MP3) ? 1152 :
+        (encCtx->frame_size > 0 ? encCtx->frame_size : default_frame_size);
+    int out_ch2 = encCtx->ch_layout.nb_channels;
+    if (out_ch2 <= 0) out_ch2 = 1;
+    int bytes_per_sample = av_get_bytes_per_sample(encCtx->sample_fmt);
+    // Accumulation buffer for resampled samples. Use read offset to avoid O(n²) memmove;
+    // compact only when offset exceeds threshold.
+    std::vector<uint8_t> accumBuf;
+    size_t accumReadOffset = 0;  // bytes consumed from start (avoids O(n²) memmove)
+    const int bytesPerFrame = bytes_per_sample * out_ch2;
+    int accumSamples = 0;
+    const size_t kCompactThreshold = 256 * 1024;  // compact when read offset exceeds 256 KB
+    auto maybeCompact = [&]() {  // Moves the unread bytes of accumBuf to the front, shrinks the vector to just those bytes and resets accumReadOffset to 0.
+        if (accumReadOffset == 0) return;  // nothing has been consumed, so there is no prefix to drop
+        if (accumReadOffset < kCompactThreshold && accumReadOffset * 2 < accumBuf.size()) return;  // defer while the consumed prefix is both below the threshold and less than half of the buffer
+        size_t valid = accumBuf.size() - accumReadOffset;  // number of bytes not yet consumed
+        if (valid > 0) memmove(accumBuf.data(), accumBuf.data() + accumReadOffset, valid);  // memmove because source and destination ranges may overlap
+        accumBuf.resize(valid);  // when valid is 0 this simply empties the buffer
+        accumReadOffset = 0;
+    };
+    // Helper lambda: packs samples from accumBuf into frames of enc_frame_size samples and sends them to the encoder; when sendPartial is true, a final shorter frame is also sent for any remainder. Packets produced by the encoder are rescaled and written to outFmt as they become available. Failures are not reported to the caller: allocation errors end the loop, send errors are logged and the frame is dropped.
+    auto flushAccumFrames = [&](bool sendPartial) {
+        int needed = enc_frame_size;
+        if (needed <= 0) return;  // guard: a non-positive frame size could never drain the buffer
+        while (accumSamples >= needed || (sendPartial && accumSamples > 0)) {  // full frames first; a short trailing frame only when sendPartial is set
+            int toSend = (accumSamples >= needed) ? needed : accumSamples;
+            AVFrame* ef = av_frame_alloc();
+            if (!ef) break;  // allocation failed: stop here, unsent samples stay in accumBuf
+            ef->format = encCtx->sample_fmt;
+            ef->sample_rate = encCtx->sample_rate;
+            if (av_channel_layout_copy(&ef->ch_layout, &encCtx->ch_layout) < 0) { av_frame_free(&ef); break; }
+            ef->nb_samples = toSend;
+            if (av_frame_get_buffer(ef, 0) < 0) { av_channel_layout_uninit(&ef->ch_layout); av_frame_free(&ef); break; }
+            int copyBytes = toSend * bytesPerFrame;  // bytesPerFrame covers all channels of one sample
+            memcpy(ef->data[0], accumBuf.data() + accumReadOffset, copyBytes);  // NOTE(review): every channel is copied into data[0] only; this looks valid for packed or mono formats, but the S16P/FLTP fallbacks with more than one channel presumably need one copy per plane - confirm
+            ef->pts = encoder_pts;  // pts advances by the number of samples sent; assumes encCtx->time_base is 1/sample_rate - TODO confirm
+            encoder_pts += toSend;
+            accumReadOffset += (size_t)copyBytes;  // marked as consumed before sending, so the samples are dropped even if the send below fails
+            accumSamples -= toSend;
+            // Send the frame; on EAGAIN drain pending packets from the encoder and retry the same frame
+            for (;;) {
+                int ret = avcodec_send_frame(encCtx, ef);
+                if (ret == 0) break;
+                if (ret == AVERROR(EAGAIN)) {
+                    AVPacket* op = av_packet_alloc();  // NOTE(review): result is not checked for nullptr
+                    while (avcodec_receive_packet(encCtx, op) == 0) {
+                        op->stream_index = outStream->index;
+                        av_packet_rescale_ts(op, encCtx->time_base, outStream->time_base);
+                        av_interleaved_write_frame(outFmt, op);  // NOTE(review): write errors are ignored
+                        av_packet_unref(op);
+                        totalPacketsFromEncoder++;
+                    }
+                    av_packet_free(&op);
+                    continue;  // retry avcodec_send_frame with the same frame
+                }
+                LOGW("convertToFormat: send_frame ret=%d frame=%d pts=%lld nb=%d", ret, totalFramesSent, (long long)ef->pts, toSend);
+                break;  // any other error: logged above, this frame is abandoned
+            }
+            // Drain every packet the encoder has ready after this frame and write it to the output
+            AVPacket* op = av_packet_alloc();  // NOTE(review): result is not checked for nullptr
+            while (avcodec_receive_packet(encCtx, op) == 0) {
+                op->stream_index = outStream->index;
+                av_packet_rescale_ts(op, encCtx->time_base, outStream->time_base);
+                av_interleaved_write_frame(outFmt, op);  // NOTE(review): write errors are ignored
+                av_packet_unref(op);
+                totalPacketsFromEncoder++;
+            }
+            av_packet_free(&op);
+            av_channel_layout_uninit(&ef->ch_layout);
+            av_frame_free(&ef);
+            totalFramesSent++;  // counted even when the send above failed
+            if (!sendPartial && accumSamples < needed) break;  // same test as the loop condition when sendPartial is false
+        }
+    };
     while (av_read_frame(inFmt, pkt) >= 0) {
         if (pkt->stream_index == audioStreamIndex) {
             if (avcodec_send_packet(decCtx, pkt) == 0) {
                 while (avcodec_receive_frame(decCtx, frame) == 0) {
+                    totalDecodedFrames++;
                     int in_sr2 = inStream->codecpar->sample_rate ? inStream->codecpar->sample_rate : decCtx->sample_rate;
                     int64_t out_nb_samples = av_rescale_rnd(swr_get_delay(swr, in_sr2) + frame->nb_samples, encCtx->sample_rate, in_sr2, AV_ROUND_UP);
                     uint8_t** outData = nullptr;
-                    int out_ch2 = encCtx->ch_layout.nb_channels;
-                    if (out_ch2 <= 0) out_ch2 = 1;
                     if (av_samples_alloc_array_and_samples(&outData, nullptr, out_ch2, (int)out_nb_samples, encCtx->sample_fmt, 0) < 0) {
                         av_packet_unref(pkt);
                         continue;
                     }
                     int converted = swr_convert(swr, outData, (int)out_nb_samples, (const uint8_t**)frame->data, frame->nb_samples);
-                    if (converted < 0) {
+                    if (converted <= 0) {
                         av_freep(&outData[0]);
                         av_freep(&outData);
                         continue;
                     }
-                    resampled->nb_samples = converted;
-                    if (av_frame_get_buffer(resampled, 0) < 0) {
-                        av_freep(&outData[0]);
-                        av_freep(&outData);
-                        continue;
-                    }
-                    int bytes_per_sample = av_get_bytes_per_sample((AVSampleFormat)resampled->format);
-                    int copy_size2 = converted * bytes_per_sample * out_ch2;
-                    memcpy(resampled->data[0], outData[0], copy_size2);
-                    if (avcodec_send_frame(encCtx, resampled) == 0) {
-                        AVPacket* outPkt = av_packet_alloc();
-                        while (avcodec_receive_packet(encCtx, outPkt) == 0) {
-                            outPkt->stream_index = outStream->index;
-                            av_packet_rescale_ts(outPkt, encCtx->time_base, outStream->time_base);
-                            av_interleaved_write_frame(outFmt, outPkt);
-                            av_packet_unref(outPkt);
-                        }
-                        av_packet_free(&outPkt);
-                    }
+                    int newBytes = converted * bytes_per_sample * out_ch2;
+                    maybeCompact();
+                    size_t oldSize = accumBuf.size();
+                    accumBuf.resize(oldSize + (size_t)newBytes);
+                    memcpy(accumBuf.data() + oldSize, outData[0], (size_t)newBytes);
+                    accumSamples += converted;
                     av_freep(&outData[0]);
                     av_freep(&outData);
-                    av_frame_unref(resampled);
                     av_frame_unref(frame);
+                    flushAccumFrames(false);
                 }
             }
         }
         av_packet_unref(pkt);
     }
+    // Drain any remaining samples in swr (resampler delay)
+    {
+        uint8_t** tailData = nullptr;
+        int tailCap = swr_get_delay(swr, encCtx->sample_rate) + 256;
+        if (tailCap > 0 && av_samples_alloc_array_and_samples(&tailData, nullptr, out_ch2, tailCap, encCtx->sample_fmt, 0) >= 0) {
+            int tailConverted = swr_convert(swr, tailData, tailCap, nullptr, 0);
+            if (tailConverted > 0) {
+                int tailBytes = tailConverted * bytes_per_sample * out_ch2;
+                maybeCompact();
+                size_t oldSize = accumBuf.size();
+                accumBuf.resize(oldSize + (size_t)tailBytes);
+                memcpy(accumBuf.data() + oldSize, tailData[0], (size_t)tailBytes);
+                accumSamples += tailConverted;
+            }
+            av_freep(&tailData[0]);
+            av_freep(&tailData);
+        }
+    }
+    // Send remaining (partial) frames
+    flushAccumFrames(true);
+    (void)totalDecodedFrames; (void)totalPacketsFromEncoder;
     // Flush encoder
     avcodec_send_frame(encCtx, nullptr);
     AVPacket* outPkt2 = av_packet_alloc();
     while (avcodec_receive_packet(encCtx, outPkt2) == 0) {
+        flushPackets++;
         outPkt2->stream_index = outStream->index;
         av_packet_rescale_ts(outPkt2, encCtx->time_base, outStream->time_base);
         av_interleaved_write_frame(outFmt, outPkt2);
         av_packet_unref(outPkt2);
     }
     av_packet_free(&outPkt2);
+    (void)flushPackets;
     av_write_trailer(outFmt);
     if (!(outFmt->oformat->flags & AVFMT_NOFILE)) avio_closep(&outFmt->pb);
+    struct stat stOut = {};
+    long outputSizeBytes = (stat(outputPath, &stOut) == 0 && S_ISREG(stOut.st_mode)) ? (long)stOut.st_size : -1;
+    LOGI("convertToFormat: done outputPath=%s outputSizeBytes=%ld", outputPath ? outputPath : "(null)", outputSizeBytes);
     av_packet_free(&pkt);
     av_frame_free(&frame);
     av_channel_layout_uninit(&resampled->ch_layout);