npm - whisper.rn - Versions diffs - 0.5.0-rc.9 → 0.5.1 - Mend

whisper.rn 0.5.0-rc.9 → 0.5.1

This diff shows the differences between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (136) hide show

package/cpp/whisper.cpp CHANGED Viewed

@@ -21,14 +21,12 @@
 #define _USE_MATH_DEFINES
 #include <cmath>
 #include <climits>
-#include <codecvt>
 #include <cstdarg>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <functional>
 #include <map>
-#include <mutex>
 #include <random>
 #include <regex>
 #include <set>
@@ -36,6 +34,10 @@
 #include <thread>
 #include <vector>
+#ifdef _MSC_VER
+#include <codecvt>
+#endif
 #if defined(WHISPER_BIG_ENDIAN)
 template<typename T>
 static T byteswap(T value) {
@@ -138,6 +140,10 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
     } while (0)
 #define WHISPER_MAX_DECODERS 8
+// temperature below which we condition on past text history
+static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f;
 #define WHISPER_MAX_NODES 4096
 static std::string format(const char * fmt, ...) {
@@ -252,45 +258,6 @@ static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i
     *(int32_t *) data = v;
 }
-// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
-// the idea is to represent the original matrix multiplication:
-//
-//   Z = X @ Y
-//
-// with the sum of two matrix multiplications:
-//
-//   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
-//
-// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
-// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
-// general-purpose kernels
-//
-static struct wsp_ggml_tensor * wsp_ggml_mul_mat_pad(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * x, struct wsp_ggml_tensor * y, int pad = 32) {
-    // use padding only if dimension 0 is at least 8 times larger than the padding
-    // else we won't get much benefit from the optimization
-    const int n_pad_req = 8;
-    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
-        return wsp_ggml_mul_mat(ctx, x, y);
-    }
-    struct wsp_ggml_tensor * x_0 = wsp_ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
-    struct wsp_ggml_tensor * x_1 = wsp_ggml_view_3d(ctx, x,  x->ne[0]%pad,      x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
-    struct wsp_ggml_tensor * y_0 = wsp_ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
-    struct wsp_ggml_tensor * y_1 = wsp_ggml_view_3d(ctx, y,  y->ne[0]%pad,      y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
-    return wsp_ggml_add(ctx,
-            wsp_ggml_mul_mat(ctx, x_0, y_0),
-            wsp_ggml_mul_mat(ctx, x_1, y_1));
-}
-// TODO: check if other platforms can benefit from this optimization
-// TODO: CUDA is currently broken - seems wsp_ggml_mul_mat does not handle views correctly
-#if defined(WSP_GGML_USE_METAL)
-#define wsp_ggml_mul_mat wsp_ggml_mul_mat_pad
-#endif
 // available whisper models
 enum e_model {
     MODEL_UNKNOWN,
@@ -919,7 +886,10 @@ struct whisper_state {
     std::vector<float> logits;
     std::vector<whisper_segment> result_all;
-    std::vector<whisper_token>   prompt_past;
+    // prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1)
+    std::vector<whisper_token>   prompt_past0; // static carried initial prompt (if enabled)
+    std::vector<whisper_token>   prompt_past1; // dynamic context from decoded output
     int lang_id = 0; // english by default
@@ -1327,7 +1297,7 @@ static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params
         for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
             wsp_ggml_backend_dev_t dev_cur = wsp_ggml_backend_dev_get(i);
             if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
-                if (cnt == 0 || cnt == params.gpu_device) {
+                if (cnt == params.gpu_device) {
                     dev = dev_cur;
                 }
@@ -1396,7 +1366,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) {
         for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
             wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
             if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
-                if (cnt == 0 || cnt == params.gpu_device) {
+                if (cnt == params.gpu_device) {
                     auto * buft = wsp_ggml_backend_dev_buffer_type(dev);
                     if (buft) {
                         buft_list.emplace_back(dev, buft);
@@ -1438,7 +1408,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
         op_supported = true;
     } else {
         switch (op) {
-            // The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT
+            // The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT and WSP_GGML_OP_GET_ROWS
+            case WSP_GGML_OP_GET_ROWS:
             case WSP_GGML_OP_MUL_MAT: {
                 wsp_ggml_init_params params = {
                     /*.mem_size   =*/ 2 * wsp_ggml_tensor_overhead(),
@@ -1454,9 +1425,15 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
                 wsp_ggml_tensor * op_tensor = nullptr;
-                int64_t n_ctx = hparams.n_audio_ctx;
-                wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
-                op_tensor = wsp_ggml_mul_mat(ctx, w, b);
+                if (op == WSP_GGML_OP_MUL_MAT) {
+                    int64_t n_ctx = hparams.n_audio_ctx;
+                    wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
+                    op_tensor = wsp_ggml_mul_mat(ctx, w, b);
+                } else if (op == WSP_GGML_OP_GET_ROWS) {
+                    int64_t num_indices = 8;
+                    wsp_ggml_tensor * indices = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, num_indices);
+                    op_tensor = wsp_ggml_get_rows(ctx, w, indices);
+                }
                 // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
                 WSP_GGML_ASSERT(w->buffer == nullptr);
@@ -2425,6 +2402,8 @@ static bool whisper_encode_internal(
                 return false;
             }
         } else {
+            wsp_ggml_backend_sched_reset(sched);
 #if defined(WHISPER_USE_COREML)
             whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
 #elif defined(WHISPER_USE_OPENVINO)
@@ -3626,7 +3605,7 @@ struct whisper_context_params whisper_context_default_params() {
     struct whisper_context_params result = {
         /*.use_gpu              =*/ true,
         /*.use_coreml           =*/ false,
-        /*.flash_attn           =*/ false,
+        /*.flash_attn           =*/ true,
         /*.gpu_device           =*/ 0,
         /*.dtw_token_timestamps =*/ false,
@@ -4710,6 +4689,7 @@ static bool whisper_vad_init_context(whisper_vad_context * vctx) {
     wsp_ggml_set_name(vctx->c_state, "c_state");
     vctx->buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx, vctx->backends[0]);
+    wsp_ggml_free(ctx);
     if (!vctx->buffer) {
         WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
         return false;
@@ -5454,6 +5434,9 @@ struct whisper_vad_segments * whisper_vad_segments_from_samples(
 void whisper_vad_free(whisper_vad_context * ctx) {
     if (ctx) {
+        if (ctx->buffer) {
+            wsp_ggml_backend_buffer_free(ctx->buffer);
+        }
         for (wsp_ggml_context * context : ctx->model.ctxs) {
             wsp_ggml_free(context);
         }
@@ -5468,6 +5451,9 @@ void whisper_vad_free(whisper_vad_context * ctx) {
             wsp_ggml_backend_free(backend);
         }
+        delete[] ctx->model.hparams.encoder_in_channels;
+        delete[] ctx->model.hparams.encoder_out_channels;
+        delete[] ctx->model.hparams.kernel_sizes;
         delete ctx;
     }
@@ -5947,9 +5933,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /* suppress_regex    =*/ nullptr,
-        /*.initial_prompt    =*/ nullptr,
-        /*.prompt_tokens     =*/ nullptr,
-        /*.prompt_n_tokens   =*/ 0,
+        /*.initial_prompt       =*/ nullptr,
+        /*.carry_initial_prompt =*/ false,
+        /*.prompt_tokens        =*/ nullptr,
+        /*.prompt_n_tokens      =*/ 0,
         /*.language          =*/ "en",
         /*.detect_language   =*/ false,
@@ -6645,6 +6632,10 @@ static bool whisper_vad(
     whisper_vad_segments * vad_segments = whisper_vad_segments_from_samples(vctx, vad_params, samples, n_samples);
+    if (!vad_segments) {
+        return false;
+    }
     if (vad_segments->data.size() > 0) {
         state->has_vad_segments = true;
         ctx->state->vad_segments.clear();
@@ -6687,7 +6678,6 @@ static bool whisper_vad(
         } catch (const std::bad_alloc & /* e */) {
             WHISPER_LOG_ERROR("%s: failed to allocate memory for filtered samples\n", __func__);
             whisper_vad_free_segments(vad_segments);
-            whisper_vad_free(vctx);
             return false;
         }
@@ -6793,6 +6783,7 @@ static bool whisper_vad(
                         __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
     }
+    whisper_vad_free_segments(vad_segments);
     return true;
 }
@@ -6901,17 +6892,22 @@ int whisper_full_with_state(
         decoder.rng = std::mt19937(j);
     }
-    // the accumulated text context so far
-    auto & prompt_past = state->prompt_past;
+    // the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1)
+    auto & prompt_past0 = state->prompt_past0;
+    auto & prompt_past1 = state->prompt_past1;
     if (params.no_context) {
-        prompt_past.clear();
+        prompt_past0.clear();
+        prompt_past1.clear();
     }
+    // calculate the maximum context budget for prompt history
+    const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
     // prepare prompt
     {
         std::vector<whisper_token> prompt_tokens;
-        // initial prompt
+        // tokenize the initial prompt
         if (!params.prompt_tokens && params.initial_prompt) {
             prompt_tokens.resize(1024);
             int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
@@ -6923,14 +6919,25 @@ int whisper_full_with_state(
             params.prompt_tokens   = prompt_tokens.data();
             params.prompt_n_tokens = prompt_tokens.size();
         }
-        // prepend the prompt tokens to the prompt_past
         if (params.prompt_tokens && params.prompt_n_tokens > 0) {
-            // parse tokens from the pointer
-            for (int i = 0; i < params.prompt_n_tokens; i++) {
-                prompt_past.push_back(params.prompt_tokens[i]);
+            if (params.carry_initial_prompt) {
+                if (prompt_past0.empty()) {
+                    const int max_tokens = std::max(1, max_prompt_ctx - 1);
+                    if (params.prompt_n_tokens > max_tokens) {
+                        WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n",
+                                        __func__, params.prompt_n_tokens, max_tokens);
+                    }
+                    const int n_tokens = std::min(params.prompt_n_tokens, max_tokens);
+                    prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens);
+                }
+            } else {
+                for (int i = 0; i < params.prompt_n_tokens; ++i) {
+                    prompt_past1.push_back(params.prompt_tokens[i]);
+                }
+                std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end());
             }
-            std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
         }
     }
@@ -7016,7 +7023,8 @@ int whisper_full_with_state(
         // if there is a very short audio segment left to process, we remove any past prompt since it tends
         // to confuse the decoder and often make it repeat or hallucinate stuff
         if (seek > seek_start && seek + 500 >= seek_end) {
-            prompt_past.clear();
+            prompt_past0.clear();
+            prompt_past1.clear();
         }
         int best_decoder_id = 0;
@@ -7077,12 +7085,25 @@ int whisper_full_with_state(
             {
                 prompt.clear();
-                // if we have already generated some text, use it as a prompt to condition the next generation
-                if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
-                    int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
+                if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
+                    const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
+                    const bool can_take1 = !prompt_past1.empty();
-                    prompt = { whisper_token_prev(ctx) };
-                    prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
+                    if (max_prompt_ctx > 0 && (can_take0 || can_take1)) {
+                        // Always start with previous token marker to connect continuity
+                        prompt.push_back(whisper_token_prev(ctx));
+                        // Take static tokens (initial prompt) first
+                        int n_take0 = 0;
+                        if (can_take0) {
+                            n_take0 = prompt_past0.size();
+                            prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
+                        }
+                        // Fill remaining budget with dynamic tokens (rolling context)
+                        const int n_take1 = std::min<int>(max_prompt_ctx - n_take0 - 1, prompt_past1.size());
+                        prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
+                    }
                 }
                 // init new transcription with sot, language (opt) and task tokens
@@ -7564,14 +7585,17 @@ int whisper_full_with_state(
             //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
-            // update prompt_past
-            prompt_past.clear();
-            if (prompt.front() == whisper_token_prev(ctx)) {
-                prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
+            // update prompt_past1
+            prompt_past1.clear();
+            if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
+                prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
             }
-            for (int i = 0; i < result_len && !is_no_speech; ++i) {
-                prompt_past.push_back(tokens_cur[i].id);
+            // Add newly decoded tokens to the rolling context
+            if (!is_no_speech) {
+                for (int i = 0; i < result_len; ++i) {
+                    prompt_past1.push_back(tokens_cur[i].id);
+                }
             }
             if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
@@ -8943,7 +8967,7 @@ void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
 }
 const char * whisper_version(void) {
-    return "1.7.6";
+    return "1.8.0";
 }
 WSP_GGML_ATTRIBUTE_FORMAT(2, 3)

package/cpp/whisper.h CHANGED Viewed

@@ -526,6 +526,7 @@ extern "C" {
         // use whisper_tokenize() to convert text to tokens
         // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
         const char * initial_prompt;
+        bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text)
         const whisper_token * prompt_tokens;
         int prompt_n_tokens;

package/ios/CMakeLists.txt CHANGED Viewed

@@ -55,7 +55,12 @@ add_library(rnwhisper SHARED
     ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
     ${SOURCE_DIR}/ggml-cpu/vec.cpp
     ${SOURCE_DIR}/ggml-cpu/ops.cpp
-    ${SOURCE_DIR}/ggml-metal.m
+    ${SOURCE_DIR}/ggml-metal/ggml-metal.cpp
+    ${SOURCE_DIR}/ggml-metal/ggml-metal-common.cpp
+    ${SOURCE_DIR}/ggml-metal/ggml-metal-device.cpp
+    ${SOURCE_DIR}/ggml-metal/ggml-metal-context.m
+    ${SOURCE_DIR}/ggml-metal/ggml-metal-device.m
+    ${SOURCE_DIR}/ggml-metal/ggml-metal-ops.cpp
     ${SOURCE_DIR}/ggml-opt.cpp
     ${SOURCE_DIR}/ggml-threading.cpp
     ${SOURCE_DIR}/ggml-quants.c

package/ios/RNWhisper.mm CHANGED Viewed

@@ -357,10 +357,9 @@ RCT_REMAP_METHOD(releaseContext,
         reject(@"whisper_error", @"Context not found", nil);
         return;
     }
-    [context invalidate];
-    [contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
-    // Also remove from unified context management
     rnwhisper_jsi::removeContext(contextId);
+    [contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    [context invalidate];
     resolve(nil);
 }
@@ -555,10 +554,9 @@ RCT_REMAP_METHOD(releaseVadContext,
         reject(@"whisper_vad_error", @"VAD context not found", nil);
         return;
     }
-    [vadContext invalidate];
-    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
-    // Also remove from unified context management
     rnwhisper_jsi::removeVadContext(contextId);
+    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    [vadContext invalidate];
     resolve(nil);
 }
@@ -574,6 +572,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
     if (contexts != nil) {
         for (NSNumber *contextId in contexts) {
             RNWhisperContext *context = contexts[contextId];
+            rnwhisper_jsi::removeContext([contextId intValue]);
             [context invalidate];
         }
         [contexts removeAllObjects];
@@ -585,6 +584,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
     if (vadContexts != nil) {
         for (NSNumber *contextId in vadContexts) {
             RNWhisperVadContext *vadContext = vadContexts[contextId];
+            rnwhisper_jsi::removeVadContext([contextId intValue]);
             [vadContext invalidate];
         }
         [vadContexts removeAllObjects];

package/ios/RNWhisperContext.mm CHANGED Viewed

@@ -87,6 +87,8 @@ static void* retained_log_block = nullptr;
 #ifdef WSP_GGML_USE_METAL
     if (cparams.use_gpu) {
+        cparams.gpu_device = 0;
         id<MTLDevice> device = MTLCreateSystemDefaultDevice();
         // Check ggml-metal availability

package/ios/RNWhisperVadContext.mm CHANGED Viewed

@@ -20,25 +20,28 @@
 #ifdef WSP_GGML_USE_METAL
     if (ctx_params.use_gpu) {
-        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-        // Check ggml-metal availability
-        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
-        if (@available(iOS 16.0, tvOS 16.0, *)) {
-            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
-        }
-        if (!supportsGgmlMetal) {
-          ctx_params.use_gpu = false;
-            reasonNoMetal = @"Metal is not supported in this device";
-        }
+        // TODO: GPU VAD is forced disabled until the performance is improved (ref: whisper.cpp/whisper_vad_init_context)
+        ctx_params.use_gpu = false;
+        // ctx_params.gpu_device = 0;
+        // id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+        // // Check ggml-metal availability
+        // BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        // if (@available(iOS 16.0, tvOS 16.0, *)) {
+        //     supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+        // }
+        // if (!supportsGgmlMetal) {
+        //   ctx_params.use_gpu = false;
+        //     reasonNoMetal = @"Metal is not supported in this device";
+        // }
+        // device = nil;
 #if TARGET_OS_SIMULATOR
         // Use the backend, but no layers because not supported fully on simulator
         ctx_params.use_gpu = false;
         reasonNoMetal = @"Metal is not supported in simulator";
 #endif
-        device = nil;
     }
 #endif // WSP_GGML_USE_METAL

package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h CHANGED Viewed

@@ -8,7 +8,7 @@
 extern "C" {
 #endif
-    #define WSP_GGML_BACKEND_API_VERSION 1
+    #define WSP_GGML_BACKEND_API_VERSION 2
     //
     // Backend buffer type
@@ -114,6 +114,9 @@ extern "C" {
         void (*event_record)(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
         // wait for an event on on a different stream
         void (*event_wait)  (wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
+        // (optional) sort/optimize the nodes in the graph
+        void                      (*graph_optimize)    (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
     };
     struct wsp_ggml_backend {

package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h CHANGED Viewed

@@ -132,6 +132,8 @@ extern "C" {
         WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {
     // all the device properties
     struct wsp_ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum wsp_ggml_backend_dev_type type;
+        // device id
+        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        //   if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct wsp_ggml_backend_dev_caps caps;
     };
@@ -302,11 +314,15 @@ extern "C" {
     WSP_GGML_API int                  wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
     WSP_GGML_API int                  wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);
-    WSP_GGML_API size_t               wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API size_t                     wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
     WSP_GGML_API void                 wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
     WSP_GGML_API wsp_ggml_backend_t       wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);
+    // Split graph without allocating it
+    WSP_GGML_API void                 wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
     // Allocate and compute graph on the backend scheduler
     WSP_GGML_API bool                 wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
     WSP_GGML_API enum wsp_ggml_status     wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);

package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h CHANGED Viewed

@@ -99,6 +99,9 @@ typedef sycl::half2 wsp_ggml_half2;
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 #define QR4_1 2
+#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
+#define QR_MXFP4 2
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2
@@ -184,6 +187,13 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(wsp_ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
+#define QK_MXFP4 32
+typedef struct {
+    uint8_t e; // E8M0
+    uint8_t qs[QK_MXFP4/2];
+} block_mxfp4;
+static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
 #define QK5_0 32
 typedef struct {
     wsp_ggml_half d;           // delta
@@ -1074,10 +1084,17 @@ WSP_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 WSP_GGML_TABLE_END()
+// TODO: fix name to kvalues_iq4_nl
 WSP_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
     -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 WSP_GGML_TABLE_END()
+// e2m1 values (doubled)
+// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+WSP_GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
+    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
+WSP_GGML_TABLE_END()
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f

package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h CHANGED Viewed

@@ -101,7 +101,6 @@ extern "C" {
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v    (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx        (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe        (void);
-    WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa       (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd  (void);
     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile  (void);
@@ -135,6 +134,7 @@ extern "C" {
     WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *,       float *, int64_t);
+    WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *,     int32_t *, int64_t);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
     WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);