npm - whisper.rn - Versions diffs - 0.4.0-rc.3 → 0.4.0-rc.5 - Mend

whisper.rn 0.4.0-rc.3 → 0.4.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/README.md +6 -6
package/android/build.gradle +4 -0
package/android/src/main/CMakeLists.txt +7 -0
package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
package/android/src/main/java/com/rnwhisper/RNWhisper.java +6 -1
package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -135
package/android/src/main/jni-utils.h +76 -0
package/android/src/main/jni.cpp +188 -109
package/cpp/README.md +1 -1
package/cpp/coreml/whisper-encoder-impl.h +1 -1
package/cpp/coreml/whisper-encoder.h +4 -0
package/cpp/coreml/whisper-encoder.mm +4 -2
package/cpp/ggml-alloc.c +451 -282
package/cpp/ggml-alloc.h +74 -8
package/cpp/ggml-backend-impl.h +112 -0
package/cpp/ggml-backend.c +1357 -0
package/cpp/ggml-backend.h +181 -0
package/cpp/ggml-impl.h +243 -0
package/cpp/{ggml-metal.metal → ggml-metal-whisper.metal} +1556 -329
package/cpp/ggml-metal.h +28 -1
package/cpp/ggml-metal.m +1128 -308
package/cpp/ggml-quants.c +7382 -0
package/cpp/ggml-quants.h +224 -0
package/cpp/ggml.c +3848 -5245
package/cpp/ggml.h +353 -155
package/cpp/rn-audioutils.cpp +68 -0
package/cpp/rn-audioutils.h +14 -0
package/cpp/rn-whisper-log.h +11 -0
package/cpp/rn-whisper.cpp +141 -59
package/cpp/rn-whisper.h +47 -15
package/cpp/whisper.cpp +1750 -964
package/cpp/whisper.h +97 -15
package/ios/RNWhisper.mm +15 -9
package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +4 -0
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +19 -0
package/ios/RNWhisperAudioUtils.h +0 -2
package/ios/RNWhisperAudioUtils.m +0 -56
package/ios/RNWhisperContext.h +8 -12
package/ios/RNWhisperContext.mm +132 -138
package/jest/mock.js +1 -1
package/lib/commonjs/NativeRNWhisper.js.map +1 -1
package/lib/commonjs/index.js +28 -9
package/lib/commonjs/index.js.map +1 -1
package/lib/commonjs/version.json +1 -1
package/lib/module/NativeRNWhisper.js.map +1 -1
package/lib/module/index.js +28 -9
package/lib/module/index.js.map +1 -1
package/lib/module/version.json +1 -1
package/lib/typescript/NativeRNWhisper.d.ts +7 -1
package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
package/lib/typescript/index.d.ts +7 -2
package/lib/typescript/index.d.ts.map +1 -1
package/package.json +6 -5
package/src/NativeRNWhisper.ts +8 -1
package/src/index.ts +29 -17
package/src/version.json +1 -1
package/whisper-rn.podspec +1 -2

package/cpp/whisper.h CHANGED Viewed

@@ -1,10 +1,20 @@
 #ifndef WHISPER_H
 #define WHISPER_H
+#include "ggml.h"
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
+#ifdef __GNUC__
+#    define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define WHISPER_DEPRECATED(func, hint) func
+#endif
 #ifdef WHISPER_SHARED
 #    ifdef _WIN32
 #        ifdef WHISPER_BUILD
@@ -21,7 +31,6 @@
 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT       400
-#define WHISPER_N_MEL       80
 #define WHISPER_HOP_LENGTH  160
 #define WHISPER_CHUNK_SIZE  30
@@ -41,7 +50,9 @@ extern "C" {
     //
     //     ...
     //
-    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+    //     whisper_context_params cparams = whisper_context_default_params();
+    //
+    //     struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
     //
     //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
     //         fprintf(stderr, "failed to process audio\n");
@@ -69,7 +80,14 @@ extern "C" {
     struct whisper_state;
     struct whisper_full_params;
-    typedef int whisper_token;
+    typedef int32_t whisper_pos;
+    typedef int32_t whisper_token;
+    typedef int32_t whisper_seq_id;
+    struct whisper_context_params {
+        bool  use_gpu;
+        bool  use_coreml;
+    };
     typedef struct whisper_token_data {
         whisper_token id;  // token id
@@ -96,21 +114,74 @@ extern "C" {
         void  (*close)(void * ctx);
     } whisper_model_loader;
+    // grammar element type
+    enum whisper_gretype {
+        // end of rule definition
+        WHISPER_GRETYPE_END            = 0,
+        // start of alternate definition for rule
+        WHISPER_GRETYPE_ALT            = 1,
+        // non-terminal element: reference to rule
+        WHISPER_GRETYPE_RULE_REF       = 2,
+        // terminal element: character (code point)
+        WHISPER_GRETYPE_CHAR           = 3,
+        // inverse char(s) ([^a], [^a-b], [^abc])
+        WHISPER_GRETYPE_CHAR_NOT       = 4,
+        // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
+        // modifies a preceding WHISPER_GRETYPE_CHAR or
+        // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        WHISPER_GRETYPE_CHAR_ALT       = 6,
+    };
+    typedef struct whisper_grammar_element {
+        enum whisper_gretype type;
+        uint32_t             value; // Unicode code point or rule ID
+    } whisper_grammar_element;
     // Various functions for loading a ggml whisper model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
-#ifdef WHISPER_USE_COREML
-    WHISPER_API struct whisper_context * whisper_init_from_file_no_coreml(const char * path_model);
-#endif
-    WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
+    WHISPER_API struct whisper_context * whisper_init_from_file_with_params  (const char * path_model,              struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size,    struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_with_params            (struct whisper_model_loader * loader, struct whisper_context_params params);
     // These are the same as the above, but the internal state of the context is not allocated automatically
     // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
+    WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state  (const char * path_model,              struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size,    struct whisper_context_params params);
+    WHISPER_API struct whisper_context * whisper_init_with_params_no_state            (struct whisper_model_loader * loader, struct whisper_context_params params);
+    WHISPER_DEPRECATED(
+        WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
+        "use whisper_init_from_file_with_params instead"
+    );
+    WHISPER_DEPRECATED(
+        WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
+        "use whisper_init_from_buffer_with_params instead"
+    );
+    WHISPER_DEPRECATED(
+        WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
+        "use whisper_init_with_params instead"
+    );
+    WHISPER_DEPRECATED(
+        WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
+        "use whisper_init_from_file_with_params_no_state instead"
+    );
+    WHISPER_DEPRECATED(
+        WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
+        "use whisper_init_from_buffer_with_params_no_state instead"
+    );
+    WHISPER_DEPRECATED(
+        WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
+        "use whisper_init_with_params_no_state instead"
+    );
     WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
@@ -135,6 +206,7 @@ extern "C" {
     WHISPER_API void whisper_free      (struct whisper_context * ctx);
     WHISPER_API void whisper_free_state(struct whisper_state * state);
     WHISPER_API void whisper_free_params(struct whisper_full_params * params);
+    WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
     // Convert RAW PCM audio to log mel spectrogram.
     // The resulting spectrogram is stored inside the default state of the provided whisper context.
@@ -244,6 +316,9 @@ extern "C" {
     // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
     WHISPER_API const char * whisper_lang_str(int id);
+    // Return the full name of the language with the specified id (e.g. 2 -> "german"), returns nullptr if not found
+    WHISPER_API const char * whisper_lang_str_full(int id);
     // Use mel data at offset_ms to try and auto-detect the spoken language
     // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
     // Returns the top language id or negative on failure
@@ -366,6 +441,7 @@ extern "C" {
         bool translate;
         bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
+        bool no_timestamps;     // do not generate timestamps
         bool single_segment;    // force single segment output (useful for streaming)
         bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
         bool print_progress;    // print progress information
@@ -443,9 +519,16 @@ extern "C" {
         // called by each decoder to filter obtained logits
         whisper_logits_filter_callback logits_filter_callback;
         void * logits_filter_callback_user_data;
+        const whisper_grammar_element ** grammar_rules;
+        size_t                           n_grammar_rules;
+        size_t                           i_start_rule;
+        float                            grammar_penalty;
     };
-    // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
+    // NOTE: the *_by_ref functions below allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
+    WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
+    WHISPER_API struct whisper_context_params whisper_context_default_params(void);
     WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
     WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@@ -534,8 +617,7 @@ extern "C" {
     // Control logging output; default behavior is to print to stderr
-    typedef void (*whisper_log_callback)(const char * line);
-    WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+    WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }

package/ios/RNWhisper.mm CHANGED Viewed

@@ -48,6 +48,7 @@ RCT_REMAP_METHOD(initContext,
     NSString *modelPath = [modelOptions objectForKey:@"filePath"];
     BOOL isBundleAsset = [[modelOptions objectForKey:@"isBundleAsset"] boolValue];
+    BOOL useGpu = [[modelOptions objectForKey:@"useGpu"] boolValue];
     BOOL useCoreMLIos = [[modelOptions objectForKey:@"useCoreMLIos"] boolValue];
     // For support debug assets in development mode
@@ -77,6 +78,7 @@ RCT_REMAP_METHOD(initContext,
         initWithModelPath:path
         contextId:contextId
         noCoreML:!useCoreMLIos
+        noMetal:!useGpu
     ];
     if ([context getContext] == NULL) {
         reject(@"whisper_cpp_error", @"Failed to load the model", nil);
@@ -85,7 +87,11 @@ RCT_REMAP_METHOD(initContext,
     [contexts setObject:context forKey:[NSNumber numberWithInt:contextId]];
-    resolve([NSNumber numberWithInt:contextId]);
+    resolve(@{
+        @"contextId": @(contextId),
+        @"gpu": @([context isMetalEnabled]),
+        @"reasonNoGPU": [context reasonNoMetal],
+    });
 }
 - (NSArray *)supportedEvents {
@@ -136,9 +142,9 @@ RCT_REMAP_METHOD(transcribeFile,
         audioDataCount:count
         options:options
         onProgress: ^(int progress) {
-            if (rn_whisper_transcribe_is_aborted(jobId)) {
-                return;
-            }
+            rnwhisper::job* job = rnwhisper::job_get(jobId);
+            if (job && job->is_aborted()) return;
             dispatch_async(dispatch_get_main_queue(), ^{
                 [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
                     body:@{
@@ -150,9 +156,9 @@ RCT_REMAP_METHOD(transcribeFile,
             });
         }
         onNewSegments: ^(NSDictionary *result) {
-            if (rn_whisper_transcribe_is_aborted(jobId)) {
-                return;
-            }
+            rnwhisper::job* job = rnwhisper::job_get(jobId);
+            if (job && job->is_aborted()) return;
             dispatch_async(dispatch_get_main_queue(), ^{
                 [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
                     body:@{
@@ -164,7 +170,7 @@ RCT_REMAP_METHOD(transcribeFile,
             });
         }
         onEnd: ^(int code) {
-            if (code != 0) {
+            if (code != 0 && code != 999) {
                 free(waveFile);
                 reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
                 return;
@@ -273,7 +279,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
         [context invalidate];
     }
-    rn_whisper_abort_all_transcribe(); // graceful abort
+    rnwhisper::job_abort_all(); // graceful abort
     [contexts removeAllObjects];
     contexts = nil;

package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata ADDED Viewed

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+</Workspace>

package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist ADDED Viewed

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>

package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate ADDED Viewed

Binary file

package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist ADDED Viewed

@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>RNWhisper.xcscheme_^#shared#^_</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+		<key>WhisperCpp.xcscheme_^#shared#^_</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>

package/ios/RNWhisperAudioUtils.h CHANGED Viewed

@@ -2,8 +2,6 @@
 @interface RNWhisperAudioUtils : NSObject
-+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
 @end

package/ios/RNWhisperAudioUtils.m CHANGED Viewed

@@ -3,62 +3,6 @@
 @implementation RNWhisperAudioUtils
-+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
-    NSMutableData *outputData = [NSMutableData data];
-    for (int i = 0; i < buffers.count; i++) {
-        int size = [sliceNSamples objectAtIndex:i].intValue;
-        NSValue *buffer = [buffers objectAtIndex:i];
-        short *bufferPtr = buffer.pointerValue;
-        [outputData appendBytes:bufferPtr length:size * sizeof(short)];
-    }
-    return outputData;
-}
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
-    NSMutableData *outputData = [NSMutableData data];
-    // WAVE header
-    [outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
-    int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
-    [outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
-    [outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
-    [outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
-    int subchunk1Size = CFSwapInt32HostToLittle(16);
-    [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
-    short audioFormat = CFSwapInt16HostToLittle(1); // PCM
-    [outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
-    short numChannels = CFSwapInt16HostToLittle(1); // mono
-    [outputData appendBytes:&numChannels length:sizeof(numChannels)];
-    int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
-    [outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
-    // (bitDepth * sampleRate * channels) >> 3
-    int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
-    [outputData appendBytes:&byteRate length:sizeof(byteRate)];
-    // (bitDepth * channels) >> 3
-    short blockAlign = CFSwapInt16HostToLittle(16 / 8);
-    [outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
-    // bitDepth
-    short bitsPerSample = CFSwapInt16HostToLittle(16);
-    [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
-    [outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
-    int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
-    [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
-    // Audio data
-    [outputData appendData:rawData];
-    // Save to file
-    [outputData writeToFile:audioOutputFile atomically:YES];
-}
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
     NSURL *url = [NSURL fileURLWithPath:filePath];
     NSData *fileData = [NSData dataWithContentsOfURL:url];

package/ios/RNWhisperContext.h CHANGED Viewed

@@ -11,29 +11,21 @@
 typedef struct {
     __unsafe_unretained id mSelf;
-    int jobId;
     NSDictionary* options;
+    struct rnwhisper::job * job;
     bool isTranscribing;
     bool isRealtime;
     bool isCapturing;
     bool isStoppedByAction;
-    int maxAudioSec;
     int nSamplesTranscribing;
-    NSMutableArray<NSValue *> *shortBufferSlices;
-    NSMutableArray<NSNumber *> *sliceNSamples;
+    std::vector<int> sliceNSamples;
     bool isUseSlices;
     int sliceIndex;
     int transcribeSliceIndex;
-    int audioSliceSec;
     NSString* audioOutputPath;
-    bool useVad;
-    int vadMs;
-    float vadThold;
-    float vadFreqThold;
     AudioQueueRef queue;
     AudioStreamBasicDescription dataFormat;
     AudioQueueBufferRef buffers[NUM_BUFFERS];
@@ -46,9 +38,13 @@ typedef struct {
     dispatch_queue_t dQueue;
     struct whisper_context * ctx;
     RNWhisperContextRecordState recordState;
+    NSString * reasonNoMetal;
+    bool isMetalEnabled;
 }
-+ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML;
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML noMetal:(BOOL)noMetal;
+- (bool)isMetalEnabled;
+- (NSString *)reasonNoMetal;
 - (struct whisper_context *)getContext;
 - (dispatch_queue_t)getDispatchQueue;
 - (OSStatus)transcribeRealtime:(int)jobId