whisper.rn 0.5.0-rc.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/ggml-alloc.c +1 -15
- package/cpp/ggml-backend-reg.cpp +17 -8
- package/cpp/ggml-backend.cpp +15 -22
- package/cpp/ggml-common.h +17 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/ggml-cpu/arch-fallback.h +34 -0
- package/cpp/ggml-cpu/ggml-cpu.c +22 -1
- package/cpp/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/ggml-cpu/ops.cpp +870 -211
- package/cpp/ggml-cpu/ops.h +3 -8
- package/cpp/ggml-cpu/quants.c +35 -0
- package/cpp/ggml-cpu/quants.h +8 -0
- package/cpp/ggml-cpu/repack.cpp +458 -47
- package/cpp/ggml-cpu/repack.h +22 -0
- package/cpp/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/ggml-cpu/traits.cpp +2 -2
- package/cpp/ggml-cpu/traits.h +1 -1
- package/cpp/ggml-cpu/vec.cpp +12 -9
- package/cpp/ggml-cpu/vec.h +107 -13
- package/cpp/ggml-impl.h +77 -0
- package/cpp/ggml-metal-impl.h +51 -12
- package/cpp/ggml-metal.m +610 -115
- package/cpp/ggml-opt.cpp +97 -41
- package/cpp/ggml-opt.h +25 -6
- package/cpp/ggml-quants.c +110 -16
- package/cpp/ggml-quants.h +6 -0
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +314 -88
- package/cpp/ggml.h +137 -11
- package/cpp/gguf.cpp +8 -1
- package/cpp/jsi/RNWhisperJSI.cpp +23 -6
- package/cpp/whisper.cpp +15 -6
- package/ios/RNWhisper.mm +6 -6
- package/ios/RNWhisperContext.mm +2 -0
- package/ios/RNWhisperVadContext.mm +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +28 -2
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +28 -2
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +1 -0
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +6 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/realtime-transcription/RealtimeTranscriber.ts +32 -0
- package/src/realtime-transcription/types.ts +6 -0
package/cpp/ggml.h
CHANGED
@@ -241,6 +241,8 @@
 #define WSP_GGML_ROPE_TYPE_MROPE  8
 #define WSP_GGML_ROPE_TYPE_VISION 24
 
+#define WSP_GGML_MROPE_SECTIONS 4
+
 #define WSP_GGML_UNUSED(x) (void)(x)
 
 #define WSP_GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -304,6 +306,16 @@
     WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
+#define WSP_GGML_TENSOR_TERNARY_OP_LOCALS \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
 #define WSP_GGML_TENSOR_BINARY_OP_LOCALS01 \
     WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
@@ -314,6 +326,13 @@
 extern "C" {
 #endif
 
+    // Function type used in fatal error callbacks
+    typedef void (*wsp_ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    WSP_GGML_API wsp_ggml_abort_callback_t wsp_ggml_set_abort_callback(wsp_ggml_abort_callback_t callback);
+
     WSP_GGML_NORETURN WSP_GGML_ATTRIBUTE_FORMAT(3, 4)
     WSP_GGML_API void wsp_ggml_abort(const char * file, int line, const char * fmt, ...);
 
@@ -388,7 +407,8 @@ extern "C" {
         // WSP_GGML_TYPE_IQ4_NL_4_4 = 36,
         // WSP_GGML_TYPE_IQ4_NL_4_8 = 37,
         // WSP_GGML_TYPE_IQ4_NL_8_8 = 38,
-        WSP_GGML_TYPE_COUNT = 39,
+        WSP_GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+        WSP_GGML_TYPE_COUNT = 40,
     };
 
     // precision
@@ -423,6 +443,7 @@ extern "C" {
         WSP_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         WSP_GGML_FTYPE_MOSTLY_IQ1_M = 23,  // except 1d tensors
         WSP_GGML_FTYPE_MOSTLY_BF16 = 24,   // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_MXFP4 = 25,  // except 1d tensors
     };
 
     // available tensor operations:
@@ -431,6 +452,7 @@ extern "C" {
 
         WSP_GGML_OP_DUP,
        WSP_GGML_OP_ADD,
+        WSP_GGML_OP_ADD_ID,
         WSP_GGML_OP_ADD1,
         WSP_GGML_OP_ACC,
         WSP_GGML_OP_SUB,
@@ -488,7 +510,7 @@ extern "C" {
         WSP_GGML_OP_POOL_1D,
         WSP_GGML_OP_POOL_2D,
         WSP_GGML_OP_POOL_2D_BACK,
-        WSP_GGML_OP_UPSCALE,
+        WSP_GGML_OP_UPSCALE,
         WSP_GGML_OP_PAD,
         WSP_GGML_OP_PAD_REFLECT_1D,
         WSP_GGML_OP_ROLL,
@@ -520,6 +542,7 @@ extern "C" {
         WSP_GGML_OP_CROSS_ENTROPY_LOSS,
         WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         WSP_GGML_OP_OPT_STEP_ADAMW,
+        WSP_GGML_OP_OPT_STEP_SGD,
 
         WSP_GGML_OP_GLU,
 
@@ -550,6 +573,9 @@ extern "C" {
         WSP_GGML_GLU_OP_REGLU,
         WSP_GGML_GLU_OP_GEGLU,
         WSP_GGML_GLU_OP_SWIGLU,
+        WSP_GGML_GLU_OP_SWIGLU_OAI,
+        WSP_GGML_GLU_OP_GEGLU_ERF,
+        WSP_GGML_GLU_OP_GEGLU_QUICK,
 
         WSP_GGML_GLU_OP_COUNT,
     };
@@ -639,6 +665,9 @@ extern "C" {
 
     // misc
 
+    WSP_GGML_API const char * wsp_ggml_version(void);
+    WSP_GGML_API const char * wsp_ggml_commit(void);
+
     WSP_GGML_API void wsp_ggml_time_init(void); // call this once at the beginning of the program
     WSP_GGML_API int64_t wsp_ggml_time_ms(void);
     WSP_GGML_API int64_t wsp_ggml_time_us(void);
@@ -819,6 +848,13 @@ extern "C" {
             struct wsp_ggml_tensor * b,
             enum wsp_ggml_type type);
 
+    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add_id(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * b,
+            struct wsp_ggml_tensor * ids);
+
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add1(
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor * a,
@@ -1137,6 +1173,22 @@ extern "C" {
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor * a);
 
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_erf(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a);
+
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_erf_swapped(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a);
+
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_quick(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a);
+
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_quick_swapped(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a);
+
     // A: n columns, r rows,
     // B: n columns, r rows,
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_glu_split(
@@ -1160,6 +1212,23 @@ extern "C" {
             struct wsp_ggml_tensor * a,
             struct wsp_ggml_tensor * b);
 
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_erf_split(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * b);
+
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_geglu_quick_split(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * b);
+
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_swiglu_oai(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * b,
+            float alpha,
+            float limit);
+
     // normalize along rows
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_norm(
             struct wsp_ggml_context * ctx,
@@ -1259,6 +1328,19 @@ extern "C" {
             struct wsp_ggml_tensor * a,
             float s);
 
+    // x = s * a + b
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale_bias(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            float s,
+            float b);
+
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale_bias_inplace(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            float s,
+            float b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set(
             struct wsp_ggml_context * ctx,
@@ -1503,8 +1585,14 @@ extern "C" {
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor * a);
 
+    // a    [ne0, ne01, ne02, ne03]
+    // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    // ne02 % ne12 == 0
+    // ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
-    // mask is optional
     // max_bias = 0.0f for no ALiBi
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext(
             struct wsp_ggml_context * ctx,
@@ -1513,6 +1601,10 @@ extern "C" {
             float scale,
             float max_bias);
 
+    WSP_GGML_API void wsp_ggml_soft_max_add_sinks(
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * sinks);
+
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back(
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor * a,
@@ -1571,7 +1663,7 @@ extern "C" {
             struct wsp_ggml_tensor * b,
             struct wsp_ggml_tensor * c,
             int n_dims,
-            int sections[4],
+            int sections[WSP_GGML_MROPE_SECTIONS],
             int mode,
             int n_ctx_orig,
             float freq_base,
@@ -1597,6 +1689,22 @@ extern "C" {
             float beta_fast,
             float beta_slow);
 
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_multi_inplace(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * b,
+            struct wsp_ggml_tensor * c,
+            int n_dims,
+            int sections[WSP_GGML_MROPE_SECTIONS],
+            int mode,
+            int n_ctx_orig,
+            float freq_base,
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow);
+
     WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom(
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor * a,
@@ -1967,11 +2075,17 @@ extern "C" {
 
 #define WSP_GGML_KQ_MASK_PAD 64
 
-    // q:    [n_embd_k, n_batch, n_head,
-    // k:    [n_embd_k, n_kv, n_head_kv,
-    // v:    [n_embd_v, n_kv, n_head_kv,
-    // mask: [n_kv, n_batch_pad,
-    // res:  [n_embd_v, n_head, n_batch,
+    // q:    [n_embd_k, n_batch, n_head, ne3 ]
+    // k:    [n_embd_k, n_kv, n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = WSP_GGML_PAD(n_batch, WSP_GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+    //
+    // broadcast:
+    // n_head % n_head_kv == 0
+    // n_head % ne32 == 0
+    // ne3 % ne33 == 0
+    //
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_ext(
             struct wsp_ggml_context * ctx,
             struct wsp_ggml_tensor * q,
@@ -1989,6 +2103,10 @@ extern "C" {
     WSP_GGML_API enum wsp_ggml_prec wsp_ggml_flash_attn_ext_get_prec(
             const struct wsp_ggml_tensor * a);
 
+    WSP_GGML_API void wsp_ggml_flash_attn_ext_add_sinks(
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * sinks);
+
     // TODO: needs to be adapted to wsp_ggml_flash_attn_ext
     WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_back(
             struct wsp_ggml_context * ctx,
@@ -2010,7 +2128,8 @@ extern "C" {
             struct wsp_ggml_tensor * dt,
             struct wsp_ggml_tensor * A,
             struct wsp_ggml_tensor * B,
-            struct wsp_ggml_tensor * C);
+            struct wsp_ggml_tensor * C,
+            struct wsp_ggml_tensor * ids);
 
     // partition into non-overlapping windows with padding if needed
     // example:
@@ -2193,7 +2312,14 @@ extern "C" {
             struct wsp_ggml_tensor * grad,
             struct wsp_ggml_tensor * m,
             struct wsp_ggml_tensor * v,
-            struct wsp_ggml_tensor * adamw_params); // parameters such
+            struct wsp_ggml_tensor * adamw_params); // parameters such as the learning rate
+
+    // stochastic gradient descent step (with weight decay)
+    WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_opt_step_sgd(
+            struct wsp_ggml_context * ctx,
+            struct wsp_ggml_tensor * a,
+            struct wsp_ggml_tensor * grad,
+            struct wsp_ggml_tensor * sgd_params); // alpha, weight decay
 
     //
     // automatic differentiation
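The abort-callback API added above makes ggml's fatal-error path hookable instead of hard-terminating with a printed message. A minimal usage sketch, assuming only the typedef and setter declared in this hunk; app_log_error is a hypothetical logging helper, not part of the package:

#include <stdio.h>

// Hypothetical app-side logging helper.
static void app_log_error(const char * msg) {
    fprintf(stderr, "[whisper.rn] %s\n", msg);
}

// Matches the wsp_ggml_abort_callback_t signature declared above.
static void on_ggml_abort(const char * error_message) {
    app_log_error(error_message);
}

static void install_ggml_abort_handler(void) {
    // The setter returns the previous callback so handlers can be chained;
    // per the header comment, passing NULL restores the default behavior.
    wsp_ggml_abort_callback_t prev = wsp_ggml_set_abort_callback(on_ggml_abort);
    (void) prev; // keep this if the old handler should be restored later
}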
package/cpp/gguf.cpp
CHANGED
@@ -631,7 +631,14 @@ struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_g
             wsp_gguf_free(ctx);
             return nullptr;
         }
-        ctx->size += WSP_GGML_PAD(wsp_ggml_nbytes(&ti.t), ctx->alignment);
+        size_t padded_size = WSP_GGML_PAD(wsp_ggml_nbytes(&ti.t), ctx->alignment);
+        if (SIZE_MAX - ctx->size < padded_size) {
+            WSP_GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n",
+                __func__, ti.t.name, ctx->size, padded_size);
+            wsp_gguf_free(ctx);
+            return nullptr;
+        }
+        ctx->size += padded_size;
     }
 }
 
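The guard added above is the standard idiom for detecting unsigned overflow before it happens: SIZE_MAX - ctx->size is the headroom left in the accumulator, and the addition wraps exactly when padded_size exceeds it. A self-contained sketch of the same check, assuming nothing beyond the C standard library:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// True when a + b would wrap around SIZE_MAX. The subtraction itself
// cannot wrap, because a is always <= SIZE_MAX.
static bool size_add_would_overflow(size_t a, size_t b) {
    return SIZE_MAX - a < b;
}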
package/cpp/jsi/RNWhisperJSI.cpp
CHANGED
@@ -548,9 +548,14 @@ void installJSIBindings(
             logError("Failed to create job for transcription");
             code = -2;
         } else {
-            code = whisper_full(context, job->params, audioResult.data.data(), audioResult.count);
-            if (job->is_aborted()) {
-                code = -999;
+            try {
+                code = whisper_full(context, job->params, audioResult.data.data(), audioResult.count);
+                if (job->is_aborted()) {
+                    code = -999;
+                }
+            } catch (...) {
+                logError("Exception during whisper_full transcription");
+                code = -3;
             }
             rnwhisper::job_remove(callbackInfo.jobId);
         }
@@ -567,6 +572,7 @@ void installJSIBindings(
             resolvePtr->call(runtime, resultObj);
         } else {
             std::string errorMsg = (code == -2) ? "Failed to create transcription job" :
+                                   (code == -3) ? "Transcription failed with exception" :
                                    (code == -999) ? "Transcription was aborted" :
                                    "Transcription failed";
             auto errorObj = createErrorObject(runtime, errorMsg, code);
@@ -631,9 +637,20 @@ void installJSIBindings(
             logInfo("Starting whisper_vad_detect_speech: vadContext=%p, audioDataCount=%d",
                 vadContext, audioResult.count);
 
-            // Perform VAD detection
-            bool isSpeech = whisper_vad_detect_speech(vadContext, audioResult.data.data(), audioResult.count);
-            logInfo("VAD detection result: %s", isSpeech ? "speech" : "no speech");
+            // Perform VAD detection with error handling
+            bool isSpeech = false;
+            try {
+                isSpeech = whisper_vad_detect_speech(vadContext, audioResult.data.data(), audioResult.count);
+                logInfo("VAD detection result: %s", isSpeech ? "speech" : "no speech");
+            } catch (...) {
+                logError("Exception during whisper_vad_detect_speech");
+                callInvoker->invokeAsync([rejectPtr, safeRuntime]() {
+                    auto& runtime = *safeRuntime;
+                    auto errorObj = createErrorObject(runtime, "VAD detection failed with exception");
+                    rejectPtr->call(runtime, errorObj);
+                });
+                return;
+            }
 
             struct whisper_vad_params vad_params = vadParams;
 
package/cpp/whisper.cpp
CHANGED
@@ -1327,7 +1327,7 @@ static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params
     for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
         wsp_ggml_backend_dev_t dev_cur = wsp_ggml_backend_dev_get(i);
         if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
-            if (cnt ==
+            if (cnt == params.gpu_device) {
                 dev = dev_cur;
             }
 
@@ -1396,7 +1396,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) {
     for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
         wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
         if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
-            if (cnt ==
+            if (cnt == params.gpu_device) {
                 auto * buft = wsp_ggml_backend_dev_buffer_type(dev);
                 if (buft) {
                     buft_list.emplace_back(dev, buft);
@@ -1438,7 +1438,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
         op_supported = true;
     } else {
         switch (op) {
-            // The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT
+            // The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT and WSP_GGML_OP_GET_ROWS
+            case WSP_GGML_OP_GET_ROWS:
             case WSP_GGML_OP_MUL_MAT: {
                 wsp_ggml_init_params params = {
                     /*.mem_size =*/ 2 * wsp_ggml_tensor_overhead(),
@@ -1454,9 +1455,15 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
 
                 wsp_ggml_tensor * op_tensor = nullptr;
 
-                int64_t n_ctx = hparams.n_audio_ctx;
-                wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
-                op_tensor = wsp_ggml_mul_mat(ctx, w, b);
+                if (op == WSP_GGML_OP_MUL_MAT) {
+                    int64_t n_ctx = hparams.n_audio_ctx;
+                    wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
+                    op_tensor = wsp_ggml_mul_mat(ctx, w, b);
+                } else if (op == WSP_GGML_OP_GET_ROWS) {
+                    int64_t num_indices = 8;
+                    wsp_ggml_tensor * indices = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, num_indices);
+                    op_tensor = wsp_ggml_get_rows(ctx, w, indices);
+                }
 
                 // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
                 WSP_GGML_ASSERT(w->buffer == nullptr);
@@ -2425,6 +2432,8 @@ static bool whisper_encode_internal(
             return false;
         }
     } else {
+        wsp_ggml_backend_sched_reset(sched);
+
 #if defined(WHISPER_USE_COREML)
         whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
 #elif defined(WHISPER_USE_OPENVINO)
package/ios/RNWhisper.mm
CHANGED
@@ -357,10 +357,9 @@ RCT_REMAP_METHOD(releaseContext,
         reject(@"whisper_error", @"Context not found", nil);
         return;
     }
-    [context invalidate];
-    [contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
-    // Also remove from unified context management
     rnwhisper_jsi::removeContext(contextId);
+    [contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    [context invalidate];
     resolve(nil);
 }
 
@@ -555,10 +554,9 @@ RCT_REMAP_METHOD(releaseVadContext,
         reject(@"whisper_vad_error", @"VAD context not found", nil);
         return;
     }
-    [vadContext invalidate];
-    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
-    // Also remove from unified context management
     rnwhisper_jsi::removeVadContext(contextId);
+    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    [vadContext invalidate];
     resolve(nil);
 }
 
@@ -574,6 +572,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
     if (contexts != nil) {
         for (NSNumber *contextId in contexts) {
             RNWhisperContext *context = contexts[contextId];
+            rnwhisper_jsi::removeContext([contextId intValue]);
             [context invalidate];
         }
         [contexts removeAllObjects];
@@ -585,6 +584,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
     if (vadContexts != nil) {
         for (NSNumber *contextId in vadContexts) {
             RNWhisperVadContext *vadContext = vadContexts[contextId];
+            rnwhisper_jsi::removeVadContext([contextId intValue]);
             [vadContext invalidate];
         }
         [vadContexts removeAllObjects];
package/cpp/ggml-common.h
CHANGED
@@ -99,6 +99,9 @@ typedef sycl::half2 wsp_ggml_half2;
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 #define QR4_1 2
 
+#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
+#define QR_MXFP4 2
+
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2
 
@@ -184,6 +187,13 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(wsp_ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
 
+#define QK_MXFP4 32
+typedef struct {
+    uint8_t e; // E8M0
+    uint8_t qs[QK_MXFP4/2];
+} block_mxfp4;
+static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
+
 #define QK5_0 32
 typedef struct {
     wsp_ggml_half d; // delta
@@ -1074,10 +1084,17 @@ WSP_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 WSP_GGML_TABLE_END()
 
+// TODO: fix name to kvalues_iq4_nl
 WSP_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
     -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 WSP_GGML_TABLE_END()
 
+// e2m1 values (doubled)
+// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+WSP_GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
+    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
+WSP_GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
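Taken together, the additions above define the MXFP4 wire format: one E8M0 scale byte plus 16 bytes packing 32 four-bit indices into kvalues_mxfp4, whose entries are the doubled E2M1 values. A decoding sketch, assuming the wsp_ggml_e8m0_to_fp32_half helper from the ggml-impl.h section below (its half-value scale compensates for the doubled table); this illustrates the layout and is not the package's actual dequantize routine:

// Decode one block_mxfp4 (32 values) into floats.
static void dequantize_block_mxfp4(const block_mxfp4 * x, float * y) {
    const float d = wsp_ggml_e8m0_to_fp32_half(x->e); // half-value E8M0 scale
    for (int j = 0; j < QK_MXFP4/2; ++j) {
        y[j]              = d * kvalues_mxfp4[x->qs[j] & 0x0F]; // low nibble
        y[j + QK_MXFP4/2] = d * kvalues_mxfp4[x->qs[j] >> 4];   // high nibble
    }
}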
package/cpp/ggml-impl.h
CHANGED

@@ -73,6 +73,22 @@ static inline int wsp_ggml_up(int n, int m) {
     return (n + m - 1) & ~(m - 1);
 }
 
+// TODO: move to ggml.h?
+static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
 //
 // logging
 //
@@ -394,6 +410,67 @@ static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) {
 #define WSP_GGML_FP16_TO_FP32(x) WSP_GGML_COMPUTE_FP16_TO_FP32(x)
 #define WSP_GGML_FP32_TO_FP16(x) WSP_GGML_COMPUTE_FP32_TO_FP16(x)
 
+static inline float wsp_ggml_e8m0_to_fp32(uint8_t x) {
+    uint32_t bits; // Stores the raw bit representation of the float
+
+    // Handle special case for minimum exponent (denormalized float)
+    if (x == 0) {
+        // Bit pattern for 2^(-127):
+        // - Sign bit: 0 (positive)
+        // - Exponent: 0 (denormalized number)
+        // - Mantissa: 0x400000 (0.5 in fractional form)
+        // Value = 0.5 * 2^(-126) = 2^(-127)
+        bits = 0x00400000;
+    }
+    // note: disabled as we don't need to handle NaNs
+    //// Handle special case for NaN (all bits set)
+    //else if (x == 0xFF) {
+    //    // Standard quiet NaN pattern:
+    //    // - Sign bit: 0
+    //    // - Exponent: all 1s (0xFF)
+    //    // - Mantissa: 0x400000 (quiet NaN flag)
+    //    bits = 0x7FC00000;
+    //}
+    // Normalized values (most common case)
+    else {
+        // Construct normalized float by shifting exponent into position:
+        // - Exponent field: 8 bits (positions 30-23)
+        // - Mantissa: 0 (implicit leading 1)
+        // Value = 2^(x - 127)
+        bits = (uint32_t) x << 23;
+    }
+
+    float result; // Final float value
+    // Safely reinterpret bit pattern as float without type-punning issues
+    memcpy(&result, &bits, sizeof(float));
+    return result;
+}
+
+// Equal to wsp_ggml_e8m0_to_fp32/2
+// Useful with MXFP4 quantization since the E0M2 values are doubled
+static inline float wsp_ggml_e8m0_to_fp32_half(uint8_t x) {
+    uint32_t bits;
+
+    // For x < 2: use precomputed denormal patterns
+    if (x < 2) {
+        // 0x00200000 = 2^(-128), 0x00400000 = 2^(-127)
+        bits = 0x00200000 << x;
+    }
+    // For x >= 2: normalized exponent adjustment
+    else {
+        // 0.5 * 2^(x-127) = 2^(x-128) = normalized with exponent (x-1)
+        bits = (uint32_t)(x - 1) << 23;
+    }
+    // Note: NaNs are not handled here
+
+    float result;
+    memcpy(&result, &bits, sizeof(float));
+    return result;
+}
+
+#define WSP_GGML_E8M0_TO_FP32(x) wsp_ggml_e8m0_to_fp32(x)
+#define WSP_GGML_E8M0_TO_FP32_HALF(x) wsp_ggml_e8m0_to_fp32_half(x)
+
 /**
  * Converts brain16 to float32.
  *