npm - whisper.rn - Versions diffs - 0.4.0-rc.4 → 0.4.0-rc.6 - Mend

whisper.rn 0.4.0-rc.4 → 0.4.0-rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/README.md +6 -6
package/android/build.gradle +4 -0
package/android/src/main/CMakeLists.txt +5 -0
package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
package/android/src/main/java/com/rnwhisper/WhisperContext.java +57 -134
package/android/src/main/jni-utils.h +76 -0
package/android/src/main/jni.cpp +188 -112
package/cpp/README.md +1 -1
package/cpp/coreml/whisper-encoder-impl.h +1 -1
package/cpp/coreml/whisper-encoder.h +4 -0
package/cpp/coreml/whisper-encoder.mm +4 -2
package/cpp/ggml-alloc.c +55 -19
package/cpp/ggml-alloc.h +8 -1
package/cpp/ggml-backend-impl.h +46 -21
package/cpp/ggml-backend.c +563 -156
package/cpp/ggml-backend.h +62 -17
package/cpp/ggml-impl.h +1 -1
package/cpp/ggml-metal-whisper.metal +2444 -359
package/cpp/ggml-metal.h +7 -1
package/cpp/ggml-metal.m +1105 -197
package/cpp/ggml-quants.c +66 -61
package/cpp/ggml-quants.h +40 -40
package/cpp/ggml.c +1040 -1590
package/cpp/ggml.h +109 -30
package/cpp/rn-audioutils.cpp +68 -0
package/cpp/rn-audioutils.h +14 -0
package/cpp/rn-whisper-log.h +11 -0
package/cpp/rn-whisper.cpp +143 -59
package/cpp/rn-whisper.h +48 -15
package/cpp/whisper.cpp +1635 -928
package/cpp/whisper.h +55 -10
package/ios/RNWhisper.mm +7 -7
package/ios/RNWhisperAudioUtils.h +0 -2
package/ios/RNWhisperAudioUtils.m +0 -56
package/ios/RNWhisperContext.h +3 -11
package/ios/RNWhisperContext.mm +68 -137
package/lib/commonjs/index.js.map +1 -1
package/lib/commonjs/version.json +1 -1
package/lib/module/index.js.map +1 -1
package/lib/module/version.json +1 -1
package/lib/typescript/index.d.ts +5 -0
package/lib/typescript/index.d.ts.map +1 -1
package/package.json +6 -5
package/src/index.ts +5 -0
package/src/version.json +1 -1
package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19

package/cpp/whisper.h CHANGED Viewed

@@ -1,6 +1,8 @@
 #ifndef WHISPER_H
 #define WHISPER_H
+#include "ggml.h"
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -48,7 +50,9 @@ extern "C" {
     //
     //     ...
     //
-    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+    //     whisper_context_params cparams = whisper_context_default_params();
+    //
+    //     struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
     //
     //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
     //         fprintf(stderr, "failed to process audio\n");
@@ -76,7 +80,9 @@ extern "C" {
     struct whisper_state;
     struct whisper_full_params;
-    typedef int whisper_token;
+    typedef int32_t whisper_pos;
+    typedef int32_t whisper_token;
+    typedef int32_t whisper_seq_id;
     struct whisper_context_params {
         bool  use_gpu;
@@ -108,18 +114,49 @@ extern "C" {
         void  (*close)(void * ctx);
     } whisper_model_loader;
+    // grammar element type
+    enum whisper_gretype {
+        // end of rule definition
+        WHISPER_GRETYPE_END            = 0,
+        // start of alternate definition for rule
+        WHISPER_GRETYPE_ALT            = 1,
+        // non-terminal element: reference to rule
+        WHISPER_GRETYPE_RULE_REF       = 2,
+        // terminal element: character (code point)
+        WHISPER_GRETYPE_CHAR           = 3,
+        // inverse char(s) ([^a], [^a-b], [^abc])
+        WHISPER_GRETYPE_CHAR_NOT       = 4,
+        // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
+        // modifies a preceding WHISPER_GRETYPE_CHAR or
+        // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        WHISPER_GRETYPE_CHAR_ALT       = 6,
+    };
+    typedef struct whisper_grammar_element {
+        enum whisper_gretype type;
+        uint32_t             value; // Unicode code point or rule ID
+    } whisper_grammar_element;
     // Various functions for loading a ggml whisper model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
-    WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_from_file_with_params  (const char * path_model,              struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size,    struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_with_params            (struct whisper_model_loader * loader, struct whisper_context_params params);
     // These are the same as the above, but the internal state of the context is not allocated automatically
     // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state  (const char * path_model,              struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size,    struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_with_params_no_state            (struct whisper_model_loader * loader, struct whisper_context_params params);
     WHISPER_DEPRECATED(
         WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
@@ -279,6 +316,9 @@ extern "C" {
     // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
     WHISPER_API const char * whisper_lang_str(int id);
+    // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
+    WHISPER_API const char * whisper_lang_str_full(int id);
     // Use mel data at offset_ms to try and auto-detect the spoken language
     // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
     // Returns the top language id or negative on failure
@@ -401,6 +441,7 @@ extern "C" {
         bool translate;
         bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
+        bool no_timestamps;     // do not generate timestamps
         bool single_segment;    // force single segment output (useful for streaming)
         bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
         bool print_progress;    // print progress information
@@ -478,6 +519,11 @@ extern "C" {
         // called by each decoder to filter obtained logits
         whisper_logits_filter_callback logits_filter_callback;
         void * logits_filter_callback_user_data;
+        const whisper_grammar_element ** grammar_rules;
+        size_t                           n_grammar_rules;
+        size_t                           i_start_rule;
+        float                            grammar_penalty;
     };
     // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
@@ -571,8 +617,7 @@ extern "C" {
     // Control logging output; default behavior is to print to stderr
-    typedef void (*whisper_log_callback)(const char * line);
-    WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+    WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }

package/ios/RNWhisper.mm CHANGED Viewed

@@ -142,9 +142,9 @@ RCT_REMAP_METHOD(transcribeFile,
         audioDataCount:count
         options:options
         onProgress: ^(int progress) {
-            if (rn_whisper_transcribe_is_aborted(jobId)) {
-                return;
-            }
+            rnwhisper::job* job = rnwhisper::job_get(jobId);
+            if (job && job->is_aborted()) return;
             dispatch_async(dispatch_get_main_queue(), ^{
                 [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
                     body:@{
@@ -156,9 +156,9 @@ RCT_REMAP_METHOD(transcribeFile,
             });
         }
         onNewSegments: ^(NSDictionary *result) {
-            if (rn_whisper_transcribe_is_aborted(jobId)) {
-                return;
-            }
+            rnwhisper::job* job = rnwhisper::job_get(jobId);
+            if (job && job->is_aborted()) return;
             dispatch_async(dispatch_get_main_queue(), ^{
                 [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
                     body:@{
@@ -279,7 +279,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
         [context invalidate];
     }
-    rn_whisper_abort_all_transcribe(); // graceful abort
+    rnwhisper::job_abort_all(); // graceful abort
     [contexts removeAllObjects];
     contexts = nil;

package/ios/RNWhisperAudioUtils.h CHANGED Viewed

@@ -2,8 +2,6 @@
 @interface RNWhisperAudioUtils : NSObject
-+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
 @end

package/ios/RNWhisperAudioUtils.m CHANGED Viewed

@@ -3,62 +3,6 @@
 @implementation RNWhisperAudioUtils
-+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
-    NSMutableData *outputData = [NSMutableData data];
-    for (int i = 0; i < buffers.count; i++) {
-        int size = [sliceNSamples objectAtIndex:i].intValue;
-        NSValue *buffer = [buffers objectAtIndex:i];
-        short *bufferPtr = buffer.pointerValue;
-        [outputData appendBytes:bufferPtr length:size * sizeof(short)];
-    }
-    return outputData;
-}
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
-    NSMutableData *outputData = [NSMutableData data];
-    // WAVE header
-    [outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
-    int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
-    [outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
-    [outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
-    [outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
-    int subchunk1Size = CFSwapInt32HostToLittle(16);
-    [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
-    short audioFormat = CFSwapInt16HostToLittle(1); // PCM
-    [outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
-    short numChannels = CFSwapInt16HostToLittle(1); // mono
-    [outputData appendBytes:&numChannels length:sizeof(numChannels)];
-    int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
-    [outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
-    // (bitDepth * sampleRate * channels) >> 3
-    int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
-    [outputData appendBytes:&byteRate length:sizeof(byteRate)];
-    // (bitDepth * channels) >> 3
-    short blockAlign = CFSwapInt16HostToLittle(16 / 8);
-    [outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
-    // bitDepth
-    short bitsPerSample = CFSwapInt16HostToLittle(16);
-    [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
-    [outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
-    int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
-    [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
-    // Audio data
-    [outputData appendData:rawData];
-    // Save to file
-    [outputData writeToFile:audioOutputFile atomically:YES];
-}
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
     NSURL *url = [NSURL fileURLWithPath:filePath];
     NSData *fileData = [NSData dataWithContentsOfURL:url];

package/ios/RNWhisperContext.h CHANGED Viewed

@@ -11,29 +11,21 @@
 typedef struct {
     __unsafe_unretained id mSelf;
-    int jobId;
     NSDictionary* options;
+    struct rnwhisper::job * job;
     bool isTranscribing;
     bool isRealtime;
     bool isCapturing;
     bool isStoppedByAction;
-    int maxAudioSec;
     int nSamplesTranscribing;
-    NSMutableArray<NSValue *> *shortBufferSlices;
-    NSMutableArray<NSNumber *> *sliceNSamples;
+    std::vector<int> sliceNSamples;
     bool isUseSlices;
     int sliceIndex;
     int transcribeSliceIndex;
-    int audioSliceSec;
     NSString* audioOutputPath;
-    bool useVad;
-    int vadMs;
-    float vadThold;
-    float vadFreqThold;
     AudioQueueRef queue;
     AudioStreamBasicDescription dataFormat;
     AudioQueueBufferRef buffers[NUM_BUFFERS];

package/ios/RNWhisperContext.mm CHANGED Viewed

@@ -1,5 +1,4 @@
 #import "RNWhisperContext.h"
-#import "RNWhisperAudioUtils.h"
 #import <Metal/Metal.h>
 #include <vector>
@@ -95,7 +94,7 @@
     return self->dQueue;
 }
-- (void)prepareRealtime:(NSDictionary *)options {
+- (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
     self->recordState.options = options;
     self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -108,74 +107,39 @@
     self->recordState.dataFormat.mReserved = 0;
     self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
-    int maxAudioSecOpt = options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0;
-    int maxAudioSec = maxAudioSecOpt > 0 ? maxAudioSecOpt : DEFAULT_MAX_AUDIO_SEC;
-    self->recordState.maxAudioSec = maxAudioSec;
-    int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
-    int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
-    self->recordState.audioOutputPath = options[@"audioOutputPath"];
-    self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
-    self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
-    if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
-    self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
-    self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
-    self->recordState.audioSliceSec = audioSliceSec;
-    self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
+    self->recordState.isRealtime = true;
+    self->recordState.isTranscribing = false;
+    self->recordState.isCapturing = false;
+    self->recordState.isStoppedByAction = false;
     self->recordState.sliceIndex = 0;
     self->recordState.transcribeSliceIndex = 0;
     self->recordState.nSamplesTranscribing = 0;
-    [self freeBufferIfNeeded];
-    self->recordState.shortBufferSlices = [NSMutableArray new];
-    int16_t *audioBufferI16 = (int16_t *) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
-    [self->recordState.shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-    self->recordState.sliceNSamples = [NSMutableArray new];
-    [self->recordState.sliceNSamples addObject:[NSNumber numberWithInt:0]];
-    self->recordState.isRealtime = true;
-    self->recordState.isTranscribing = false;
-    self->recordState.isCapturing = false;
-    self->recordState.isStoppedByAction = false;
+    self->recordState.sliceNSamples.push_back(0);
+    self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
+    self->recordState.job->set_realtime_params(
+        {
+            .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
+            .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
+            .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
+            .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
+        },
+        options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
+        options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
+        options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
+        options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
+    );
+    self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
     self->recordState.mSelf = self;
 }
-- (void)freeBufferIfNeeded {
-    if (self->recordState.shortBufferSlices != nil) {
-        for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
-            int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
-            free(audioBufferI16);
-        }
-        self->recordState.shortBufferSlices = nil;
-    }
-}
-bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
 {
-    bool isSpeech = true;
-    if (!state->isTranscribing && state->useVad) {
-        int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
-        if (nSamples + n > sampleSize) {
-            int start = nSamples + n - sampleSize;
-            std::vector<float> audioBufferF32Vec(sampleSize);
-            for (int i = 0; i < sampleSize; i++) {
-                audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
-            }
-            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
-            NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
-        } else {
-            isSpeech = false;
-        }
-    }
-    return isSpeech;
+    if (state->isTranscribing) return true;
+    return state->job->vad_simple(sliceIndex, nSamples, n);
 }
 void AudioInputCallback(void * inUserData,
@@ -196,15 +160,15 @@ void AudioInputCallback(void * inUserData,
     }
     int totalNSamples = 0;
-    for (int i = 0; i < [state->sliceNSamples count]; i++) {
-        totalNSamples += [[state->sliceNSamples objectAtIndex:i] intValue];
+    for (int i = 0; i < state->sliceNSamples.size(); i++) {
+        totalNSamples += state->sliceNSamples[i];
     }
     const int n = inBuffer->mAudioDataByteSize / 2;
-    int nSamples = [state->sliceNSamples[state->sliceIndex] intValue];
+    int nSamples = state->sliceNSamples[state->sliceIndex];
-    if (totalNSamples + n > state->maxAudioSec * WHISPER_SAMPLE_RATE) {
+    if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
         NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
         state->isCapturing = false;
         [state->mSelf stopAudio];
@@ -218,8 +182,8 @@ void AudioInputCallback(void * inUserData,
             !state->isTranscribing &&
             nSamples != state->nSamplesTranscribing
         ) {
-            int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
-            if (!vad(state, audioBufferI16, nSamples, 0)) {
+            bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+            if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
                 [state->mSelf finishRealtimeTranscribe:state result:@{}];
                 return;
             }
@@ -231,31 +195,25 @@ void AudioInputCallback(void * inUserData,
         return;
     }
-    int audioSliceSec = state->audioSliceSec;
-    if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
+    if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
         // next slice
         state->sliceIndex++;
         nSamples = 0;
-        int16_t* audioBufferI16 = (int16_t*) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
-        [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-        [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
+        state->sliceNSamples.push_back(0);
     }
-    // Append to buffer
     NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
-    int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
-    for (int i = 0; i < n; i++) {
-        audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
-    }
+    state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
-    bool isSpeech = vad(state, audioBufferI16, nSamples, n);
+    bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
     nSamples += n;
-    state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
+    state->sliceNSamples[state->sliceIndex] = nSamples;
     AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
-    if (!isSpeech) return;
+    bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+    if (!isSamplesEnough || !isSpeech) return;
     if (!state->isTranscribing) {
         state->isTranscribing = true;
@@ -267,32 +225,27 @@ void AudioInputCallback(void * inUserData,
 - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
     // Save wav if needed
-    if (state->audioOutputPath != nil) {
+    if (state->job->audio_output_path != nullptr) {
         // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-        [RNWhisperAudioUtils
-            saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
-                            sliceNSamples:state->sliceNSamples]
-            audioOutputFile:state->audioOutputPath
-        ];
+        rnaudioutils::save_wav_file(
+            rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
+            state->job->audio_output_path
+        );
     }
-    state->transcribeHandler(state->jobId, @"end", result);
+    state->transcribeHandler(state->job->job_id, @"end", result);
+    rnwhisper::job_remove(state->job->job_id);
 }
 - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
-    int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
+    int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
     state->nSamplesTranscribing = nSamplesOfIndex;
     NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
-    int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->transcribeSliceIndex] pointerValue];
-    float* audioBufferF32 = (float*) malloc(state->nSamplesTranscribing * sizeof(float));
-    // convert I16 to F32
-    for (int i = 0; i < state->nSamplesTranscribing; i++) {
-        audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
-    }
+    float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
     CFTimeInterval timeStart = CACurrentMediaTime();
-    struct whisper_full_params params = [state->mSelf getParams:state->options jobId:state->jobId];
-    int code = [state->mSelf fullTranscribe:state->jobId params:params audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing];
-    free(audioBufferF32);
+    int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
+    free(pcmf32);
     CFTimeInterval timeEnd = CACurrentMediaTime();
     const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
@@ -312,7 +265,7 @@ void AudioInputCallback(void * inUserData,
         result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
     }
-    nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
+    nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
     bool isStopped = state->isStoppedByAction || (
         !state->isCapturing &&
@@ -340,10 +293,10 @@ void AudioInputCallback(void * inUserData,
         [state->mSelf finishRealtimeTranscribe:state result:result];
     } else if (code == 0) {
         result[@"isCapturing"] = @(true);
-        state->transcribeHandler(state->jobId, @"transcribe", result);
+        state->transcribeHandler(state->job->job_id, @"transcribe", result);
     } else {
         result[@"isCapturing"] = @(true);
-        state->transcribeHandler(state->jobId, @"transcribe", result);
+        state->transcribeHandler(state->job->job_id, @"transcribe", result);
     }
     if (continueNeeded) {
@@ -371,8 +324,7 @@ void AudioInputCallback(void * inUserData,
     onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
 {
     self->recordState.transcribeHandler = onTranscribe;
-    self->recordState.jobId = jobId;
-    [self prepareRealtime:options];
+    [self prepareRealtime:jobId options:options];
     OSStatus status = AudioQueueNewInput(
         &self->recordState.dataFormat,
@@ -413,9 +365,9 @@ struct rnwhisper_segments_callback_data {
     dispatch_async(dQueue, ^{
         self->recordState.isStoppedByAction = false;
         self->recordState.isTranscribing = true;
-        self->recordState.jobId = jobId;
-        whisper_full_params params = [self getParams:options jobId:jobId];
+        whisper_full_params params = [self createParams:options jobId:jobId];
         if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
             params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
                 void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -460,8 +412,10 @@ struct rnwhisper_segments_callback_data {
             };
             params.new_segment_callback_user_data = &user_data;
         }
-        int code = [self fullTranscribe:jobId params:params audioData:audioData audioDataCount:audioDataCount];
-        self->recordState.jobId = -1;
+        rnwhisper::job* job = rnwhisper::job_new(jobId, params);;
+        int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
+        rnwhisper::job_remove(jobId);
         self->recordState.isTranscribing = false;
         onEnd(code);
     });
@@ -476,7 +430,7 @@ struct rnwhisper_segments_callback_data {
 }
 - (void)stopTranscribe:(int)jobId {
-    rn_whisper_abort_transcribe(jobId);
+    if (self->recordState.job) self->recordState.job->abort();
     if (self->recordState.isRealtime && self->recordState.isCapturing) {
         [self stopAudio];
         if (!self->recordState.isTranscribing) {
@@ -490,13 +444,11 @@ struct rnwhisper_segments_callback_data {
 }
 - (void)stopCurrentTranscribe {
-    if (!self->recordState.jobId) {
-        return;
-    }
-    [self stopTranscribe:self->recordState.jobId];
+    if (self->recordState.job == nullptr) return;
+    [self stopTranscribe:self->recordState.job->job_id];
 }
-- (struct whisper_full_params)getParams:(NSDictionary *)options jobId:(int)jobId {
+- (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
     struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
     const int n_threads = options[@"maxThreads"] != nil ?
@@ -517,7 +469,7 @@ struct rnwhisper_segments_callback_data {
     params.print_special    = false;
     params.speed_up         = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
     params.translate        = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
-    params.language         = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
+    params.language         = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
     params.n_threads        = n_threads > 0 ? n_threads : default_n_threads;
     params.offset_ms        = 0;
     params.no_context       = true;
@@ -534,7 +486,6 @@ struct rnwhisper_segments_callback_data {
     if (options[@"maxContext"] != nil) {
         params.n_max_text_ctx = [options[@"maxContext"] intValue];
     }
     if (options[@"offset"] != nil) {
         params.offset_ms = [options[@"offset"] intValue];
     }
@@ -550,39 +501,20 @@ struct rnwhisper_segments_callback_data {
     if (options[@"temperatureInc"] != nil) {
         params.temperature_inc = [options[@"temperature_inc"] floatValue];
     }
     if (options[@"prompt"] != nil) {
-        params.initial_prompt = [options[@"prompt"] UTF8String];
+        params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
     }
-    // abort handler
-    bool *abort_ptr = rn_whisper_assign_abort_map(jobId);
-    params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-        bool is_aborted = *(bool*)user_data;
-        return !is_aborted;
-    };
-    params.encoder_begin_callback_user_data = abort_ptr;
-    params.abort_callback = [](void * user_data) {
-        bool is_aborted = *(bool*)user_data;
-        return is_aborted;
-    };
-    params.abort_callback_user_data = abort_ptr;
     return params;
 }
-- (int)fullTranscribe:(int)jobId
-  params:(struct whisper_full_params)params
+- (int)fullTranscribe:(rnwhisper::job *)job
   audioData:(float *)audioData
   audioDataCount:(int)audioDataCount
 {
     whisper_reset_timings(self->ctx);
-    int code = whisper_full(self->ctx, params, audioData, audioDataCount);
-    if (rn_whisper_transcribe_is_aborted(jobId)) {
-        code = -999;
-    }
-    rn_whisper_remove_abort_map(jobId);
+    int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
+    if (job && job->is_aborted()) code = -999;
     // if (code == 0) {
     //     whisper_print_timings(self->ctx);
     // }
@@ -616,7 +548,6 @@ struct rnwhisper_segments_callback_data {
 - (void)invalidate {
     [self stopCurrentTranscribe];
     whisper_free(self->ctx);
-    [self freeBufferIfNeeded];
 }
 @end