npm - whisper.rn - Versions diffs - 0.4.1 → 0.4.3 - Mend

whisper.rn 0.4.1 → 0.4.3

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (74)

package/cpp/ggml.h CHANGED Viewed

@@ -470,6 +470,7 @@ extern "C" {
         WSP_GGML_OP_TRANSPOSE,
         WSP_GGML_OP_GET_ROWS,
         WSP_GGML_OP_GET_ROWS_BACK,
+        WSP_GGML_OP_SET_ROWS,
         WSP_GGML_OP_DIAG,
         WSP_GGML_OP_DIAG_MASK_INF,
         WSP_GGML_OP_DIAG_MASK_ZERO,
@@ -481,6 +482,7 @@ extern "C" {
         WSP_GGML_OP_CONV_TRANSPOSE_1D,
         WSP_GGML_OP_IM2COL,
         WSP_GGML_OP_IM2COL_BACK,
+        WSP_GGML_OP_CONV_2D,
         WSP_GGML_OP_CONV_2D_DW,
         WSP_GGML_OP_CONV_TRANSPOSE_2D,
         WSP_GGML_OP_POOL_1D,
@@ -519,6 +521,8 @@ extern "C" {
         WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         WSP_GGML_OP_OPT_STEP_ADAMW,
+        WSP_GGML_OP_GLU,
         WSP_GGML_OP_COUNT,
     };
@@ -542,6 +546,14 @@ extern "C" {
         WSP_GGML_UNARY_OP_COUNT,
     };
+    enum wsp_ggml_glu_op {
+        WSP_GGML_GLU_OP_REGLU,
+        WSP_GGML_GLU_OP_GEGLU,
+        WSP_GGML_GLU_OP_SWIGLU,
+        WSP_GGML_GLU_OP_COUNT,
+    };
     enum wsp_ggml_object_type {
         WSP_GGML_OBJECT_TYPE_TENSOR,
         WSP_GGML_OBJECT_TYPE_GRAPH,
@@ -657,6 +669,7 @@ extern "C" {
     WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op   op);
     WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
+    WSP_GGML_API const char * wsp_ggml_glu_op_name(enum wsp_ggml_glu_op op);
     WSP_GGML_API const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name
     WSP_GGML_API size_t  wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);
@@ -687,6 +700,9 @@ extern "C" {
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     WSP_GGML_API bool wsp_ggml_is_contiguous_channels(const struct wsp_ggml_tensor * tensor);
+    // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
+    WSP_GGML_API bool wsp_ggml_is_contiguous_rows(const struct wsp_ggml_tensor * tensor);
     WSP_GGML_API bool wsp_ggml_are_same_shape (const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
     WSP_GGML_API bool wsp_ggml_are_same_stride(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
@@ -758,6 +774,7 @@ extern "C" {
     WSP_GGML_API void wsp_ggml_unravel_index(const struct wsp_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
     WSP_GGML_API enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);
+    WSP_GGML_API enum wsp_ggml_glu_op wsp_ggml_get_glu_op(const struct wsp_ggml_tensor * tensor);
     WSP_GGML_API void *  wsp_ggml_get_data    (const struct wsp_ggml_tensor * tensor);
     WSP_GGML_API float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor);
@@ -1086,6 +1103,63 @@ extern "C" {
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor  * a);
+    // gated linear unit ops
+    // A: n columns, r rows,
+    // result is n / 2 columns, r rows,
+    // expects gate in second half of row, unless swapped is true
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_glu(
+            struct wsp_ggml_context * ctx,
+             struct wsp_ggml_tensor * a,
+             enum wsp_ggml_glu_op     op,
+             bool                 swapped);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reglu(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reglu_swapped(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_swapped(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_swiglu(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_swiglu_swapped(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a);
+    // A: n columns, r rows,
+    // B: n columns, r rows,
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_glu_split(
+            struct wsp_ggml_context * ctx,
+             struct wsp_ggml_tensor * a,
+             struct wsp_ggml_tensor * b,
+             enum wsp_ggml_glu_op     op);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reglu_split(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a,
+            struct wsp_ggml_tensor  * b);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_split(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a,
+            struct wsp_ggml_tensor  * b);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_swiglu_split(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a,
+            struct wsp_ggml_tensor  * b);
     // normalize along rows
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_norm(
             struct wsp_ggml_context * ctx,
@@ -1375,6 +1449,23 @@ extern "C" {
             struct wsp_ggml_tensor  * b,  // row indices
             struct wsp_ggml_tensor  * c); // data for wsp_ggml_get_rows, only used for its shape
+    // a TD  [n_embd, ne1,    ne2,    ne3]
+    // b TS  [n_embd, n_rows, ne02,   ne03] | ne02 == ne2, ne03 == ne3
+    // c I64 [n_rows, ne11,   ne12,   1]    | c[i] in [0, ne1)
+    //
+    // undefined behavior if destination rows overlap
+    //
+    // broadcast:
+    //   ne2 % ne11 == 0
+    //   ne3 % ne12 == 0
+    //
+    // return view(a)
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_rows(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a,  // destination
+            struct wsp_ggml_tensor  * b,  // source
+            struct wsp_ggml_tensor  * c); // row indices
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag(
         struct wsp_ggml_context     * ctx,
         struct wsp_ggml_tensor      * a);
@@ -1723,6 +1814,17 @@ extern "C" {
             struct wsp_ggml_tensor  * b,
             int                   stride);
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d_direct(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct wsp_ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
     enum wsp_ggml_op_pool {
         WSP_GGML_OP_POOL_MAX,
         WSP_GGML_OP_POOL_AVG,
@@ -1765,6 +1867,12 @@ extern "C" {
     enum wsp_ggml_scale_mode {
         WSP_GGML_SCALE_MODE_NEAREST  = 0,
         WSP_GGML_SCALE_MODE_BILINEAR = 1,
+        WSP_GGML_SCALE_MODE_COUNT
+    };
+    enum wsp_ggml_scale_flag {
+        WSP_GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
     };
     // interpolate
@@ -1777,14 +1885,26 @@ extern "C" {
     // interpolate
     // interpolate scale to specified dimensions
-    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_upscale_ext(
+    WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_upscale_ext(
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor  * a,
             int                   ne0,
             int                   ne1,
             int                   ne2,
             int                   ne3,
-            enum wsp_ggml_scale_mode  mode);
+            enum wsp_ggml_scale_mode  mode),
+        "use wsp_ggml_interpolate instead");
+    // Up- or downsamples the input to the specified size.
+    // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_interpolate(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            uint32_t              mode); // wsp_ggml_scale_mode [ | wsp_ggml_scale_flag...]
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad(

package/cpp/gguf.cpp CHANGED Viewed

@@ -335,7 +335,11 @@ struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_g
         for (uint32_t i = 0; i < magic.size(); i++) {
             if (magic[i] != WSP_GGUF_MAGIC[i]) {
-                WSP_GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+                char c0 = isprint(magic[0]) ? magic[0] : '?';
+                char c1 = isprint(magic[1]) ? magic[1] : '?';
+                char c2 = isprint(magic[2]) ? magic[2] : '?';
+                char c3 = isprint(magic[3]) ? magic[3] : '?';
+                WSP_GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
                 wsp_gguf_free(ctx);
                 return nullptr;
             }

package/cpp/whisper.cpp CHANGED Viewed

@@ -8942,6 +8942,10 @@ void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
     wsp_ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
 }
+const char * whisper_version(void) {
+    return "1.7.6";
+}
 WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
 static void whisper_log_internal(wsp_ggml_log_level level, const char * format, ...) {
     va_list args;

package/cpp/whisper.h CHANGED Viewed

@@ -199,6 +199,8 @@ extern "C" {
         float samples_overlap;         // Overlap in seconds when copying audio samples from speech segment.
     } whisper_vad_params;
+    WHISPER_API const char * whisper_version(void);
     // Various functions for loading a ggml whisper model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure

package/ios/RNWhisper.mm CHANGED Viewed

@@ -352,39 +352,10 @@ RCT_REMAP_METHOD(releaseAllContexts,
                  withResolver:(RCTPromiseResolveBlock)resolve
                  withRejecter:(RCTPromiseRejectBlock)reject)
 {
-    [self invalidate];
+    [self releaseAllContexts];
     resolve(nil);
 }
-- (void)invalidate {
-    [super invalidate];
-    if (contexts == nil) {
-        return;
-    }
-    for (NSNumber *contextId in contexts) {
-        RNWhisperContext *context = contexts[contextId];
-        [context invalidate];
-    }
-    if (vadContexts != nil) {
-        for (NSNumber *contextId in vadContexts) {
-            RNWhisperVadContext *vadContext = vadContexts[contextId];
-            [vadContext invalidate];
-        }
-        [vadContexts removeAllObjects];
-        vadContexts = nil;
-    }
-    rnwhisper::job_abort_all(); // graceful abort
-    [contexts removeAllObjects];
-    contexts = nil;
-    [RNWhisperDownloader clearCache];
-}
 // MARK: - AudioSessionUtils
 RCT_EXPORT_METHOD(getAudioSessionCurrentCategory:(RCTPromiseResolveBlock)resolve
@@ -507,13 +478,16 @@ RCT_REMAP_METHOD(vadDetectSpeech,
     }
     // Decode base64 audio data
-    NSData *audioData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
-    if (audioData == nil) {
+    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
+    if (pcmData == nil) {
         reject(@"whisper_vad_error", @"Invalid audio data", nil);
         return;
     }
-    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    int count = 0;
+    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
+    NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
     resolve(segments);
 }
@@ -549,10 +523,7 @@ RCT_REMAP_METHOD(vadDetectSpeechFile,
         return;
     }
-    // Convert float32 data to NSData for VAD context
-    NSData *audioData = [NSData dataWithBytes:data length:count * sizeof(float)];
-    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
     resolve(segments);
 }
@@ -574,14 +545,40 @@ RCT_REMAP_METHOD(releaseVadContext,
 RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
                  withRejecter:(RCTPromiseRejectBlock)reject)
 {
+    [self releaseAllVadContexts];
+    resolve(nil);
+}
+- (void)releaseAllContexts {
+    rnwhisper::job_abort_all(); // graceful abort
+    if (contexts != nil) {
+        for (NSNumber *contextId in contexts) {
+            RNWhisperContext *context = contexts[contextId];
+            [context invalidate];
+        }
+        [contexts removeAllObjects];
+        contexts = nil;
+    }
+}
+- (void)releaseAllVadContexts {
     if (vadContexts != nil) {
         for (NSNumber *contextId in vadContexts) {
             RNWhisperVadContext *vadContext = vadContexts[contextId];
             [vadContext invalidate];
         }
         [vadContexts removeAllObjects];
+        vadContexts = nil;
     }
-    resolve(nil);
+}
+- (void)invalidate {
+    [super invalidate];
+    [self releaseAllContexts];
+    [self releaseAllVadContexts];
+    [RNWhisperDownloader clearCache];
 }
 #ifdef RCT_NEW_ARCH_ENABLED

package/ios/RNWhisperVadContext.h CHANGED Viewed

@@ -23,7 +23,7 @@
 - (NSString *)reasonNoMetal;
 - (struct whisper_vad_context *)getVadContext;
 - (dispatch_queue_t)getDispatchQueue;
-- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options;
+- (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options;
 - (void)invalidate;
 @end

package/ios/RNWhisperVadContext.mm CHANGED Viewed

@@ -73,18 +73,14 @@
     return dQueue;
 }
-- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options {
+- (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options {
     if (vctx == NULL) {
         NSLog(@"VAD context is null");
         return @[];
     }
-    // Convert NSData to float array
-    const float *samples = (const float *)[audioData bytes];
-    int n_samples = (int)[audioData length] / sizeof(float);
     // Run VAD detection
-    bool speechDetected = whisper_vad_detect_speech(vctx, samples, n_samples);
+    bool speechDetected = whisper_vad_detect_speech(vctx, samples, samplesCount);
     if (!speechDetected) {
         return @[];
     }

package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h CHANGED Viewed

@@ -339,7 +339,7 @@ extern "C" {
     typedef bool (*wsp_ggml_backend_eval_callback)(int node_index, struct wsp_ggml_tensor * t1, struct wsp_ggml_tensor * t2, void * user_data);
     // Compare the output of two backends
-    WSP_GGML_API bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data);
+    WSP_GGML_API bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor * test_node);
     // Tensor initialization
     WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr);

package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h CHANGED Viewed

@@ -101,6 +101,7 @@ extern "C" {
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v    (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx        (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe        (void);
+    WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa       (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd  (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile  (void);
@@ -133,6 +134,7 @@ extern "C" {
     WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
+    WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *,       float *, int64_t);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);