whisper.rn 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +264 -126
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +13 -5
  6. package/cpp/ggml-backend.cpp +207 -17
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  9. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  10. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  11. package/cpp/ggml-cpu/common.h +14 -0
  12. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  13. package/cpp/ggml-cpu/ggml-cpu.c +48 -41
  14. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  15. package/cpp/ggml-cpu/ops.cpp +518 -767
  16. package/cpp/ggml-cpu/ops.h +2 -0
  17. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  18. package/cpp/ggml-cpu/vec.cpp +161 -20
  19. package/cpp/ggml-cpu/vec.h +400 -51
  20. package/cpp/ggml-cpu.h +1 -1
  21. package/cpp/ggml-impl.h +43 -10
  22. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  23. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  24. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  25. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  27. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  28. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  29. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  31. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  33. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  34. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  35. package/cpp/ggml-metal-impl.h +40 -40
  36. package/cpp/ggml-metal.h +1 -6
  37. package/cpp/ggml-quants.c +1 -0
  38. package/cpp/ggml.c +175 -13
  39. package/cpp/ggml.h +84 -5
  40. package/cpp/jsi/RNWhisperJSI.cpp +2 -0
  41. package/cpp/jsi/ThreadPool.h +3 -3
  42. package/cpp/whisper.cpp +85 -70
  43. package/cpp/whisper.h +1 -0
  44. package/ios/CMakeLists.txt +6 -1
  45. package/ios/RNWhisperVadContext.mm +14 -13
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  50. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  84. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  86. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  87. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  92. package/lib/commonjs/version.json +1 -1
  93. package/lib/module/version.json +1 -1
  94. package/package.json +1 -1
  95. package/src/version.json +1 -1
  96. package/whisper-rn.podspec +8 -9
  97. package/cpp/ggml-metal.m +0 -6779
  98. package/cpp/ggml-whisper-sim.metallib +0 -0
  99. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml.h CHANGED
@@ -244,6 +244,13 @@
244
244
  #define WSP_GGML_MROPE_SECTIONS 4
245
245
 
246
246
  #define WSP_GGML_UNUSED(x) (void)(x)
247
+ #ifdef __CUDACC__
248
+ template<typename... Args>
249
+ __host__ __device__ constexpr inline void wsp_ggml_unused_vars_impl(Args&&...) noexcept {}
250
+ #define WSP_GGML_UNUSED_VARS(...) wsp_ggml_unused_vars_impl(__VA_ARGS__)
251
+ #else
252
+ #define WSP_GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
253
+ #endif // __CUDACC__
247
254
 
248
255
  #define WSP_GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
249
256
 
@@ -277,19 +284,19 @@
277
284
  // WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
278
285
  //
279
286
  #define WSP_GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
280
- const type prefix##0 = (pointer)->array[0]; \
287
+ const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
281
288
  WSP_GGML_UNUSED(prefix##0);
282
289
  #define WSP_GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
283
290
  WSP_GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
284
- const type prefix##1 = (pointer)->array[1]; \
291
+ const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
285
292
  WSP_GGML_UNUSED(prefix##1);
286
293
  #define WSP_GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
287
294
  WSP_GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
288
- const type prefix##2 = (pointer)->array[2]; \
295
+ const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
289
296
  WSP_GGML_UNUSED(prefix##2);
290
297
  #define WSP_GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
291
298
  WSP_GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
292
- const type prefix##3 = (pointer)->array[3]; \
299
+ const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
293
300
  WSP_GGML_UNUSED(prefix##3);
294
301
 
295
302
  #define WSP_GGML_TENSOR_UNARY_OP_LOCALS \
@@ -504,7 +511,9 @@ extern "C" {
504
511
  WSP_GGML_OP_CONV_TRANSPOSE_1D,
505
512
  WSP_GGML_OP_IM2COL,
506
513
  WSP_GGML_OP_IM2COL_BACK,
514
+ WSP_GGML_OP_IM2COL_3D,
507
515
  WSP_GGML_OP_CONV_2D,
516
+ WSP_GGML_OP_CONV_3D,
508
517
  WSP_GGML_OP_CONV_2D_DW,
509
518
  WSP_GGML_OP_CONV_TRANSPOSE_2D,
510
519
  WSP_GGML_OP_POOL_1D,
@@ -1395,6 +1404,7 @@ extern "C" {
1395
1404
  struct wsp_ggml_tensor * a,
1396
1405
  struct wsp_ggml_tensor * b);
1397
1406
 
1407
+ // note: casting from f32 to i32 will discard the fractional part
1398
1408
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cast(
1399
1409
  struct wsp_ggml_context * ctx,
1400
1410
  struct wsp_ggml_tensor * a,
@@ -1519,7 +1529,11 @@ extern "C" {
1519
1529
  struct wsp_ggml_context * ctx,
1520
1530
  struct wsp_ggml_tensor * a);
1521
1531
 
1522
- // supports 3D: a->ne[2] == b->ne[1]
1532
+ // supports 4D a:
1533
+ // a [n_embd, ne1, ne2, ne3]
1534
+ // b I32 [n_rows, ne2, ne3, 1]
1535
+ //
1536
+ // return [n_embd, n_rows, ne2, ne3]
1523
1537
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows(
1524
1538
  struct wsp_ggml_context * ctx,
1525
1539
  struct wsp_ggml_tensor * a, // data
@@ -1862,6 +1876,41 @@ extern "C" {
1862
1876
  int d0, // dilation dimension 0
1863
1877
  int d1); // dilation dimension 1
1864
1878
 
1879
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col_3d(
1880
+ struct wsp_ggml_context * ctx,
1881
+ struct wsp_ggml_tensor * a,
1882
+ struct wsp_ggml_tensor * b,
1883
+ int64_t IC,
1884
+ int s0, // stride width
1885
+ int s1, // stride height
1886
+ int s2, // stride depth
1887
+ int p0, // padding width
1888
+ int p1, // padding height
1889
+ int p2, // padding depth
1890
+ int d0, // dilation width
1891
+ int d1, // dilation height
1892
+ int d2, // dilation depth
1893
+ enum wsp_ggml_type dst_type);
1894
+
1895
+ // a: [OC*IC, KD, KH, KW]
1896
+ // b: [N*IC, ID, IH, IW]
1897
+ // result: [N*OC, OD, OH, OW]
1898
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_3d(
1899
+ struct wsp_ggml_context * ctx,
1900
+ struct wsp_ggml_tensor * a,
1901
+ struct wsp_ggml_tensor * b,
1902
+ int64_t IC,
1903
+ int s0, // stride width
1904
+ int s1, // stride height
1905
+ int s2, // stride depth
1906
+ int p0, // padding width
1907
+ int p1, // padding height
1908
+ int p2, // padding depth
1909
+ int d0, // dilation width
1910
+ int d1, // dilation height
1911
+ int d2 // dilation depth
1912
+ );
1913
+
1865
1914
  // kernel size is a->ne[0] x a->ne[1]
1866
1915
  // stride is equal to kernel size
1867
1916
  // padding is zero
@@ -1933,6 +1982,23 @@ extern "C" {
1933
1982
  int d0, // dilation dimension 0
1934
1983
  int d1); // dilation dimension 1
1935
1984
 
1985
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_3d_direct(
1986
+ struct wsp_ggml_context * ctx,
1987
+ struct wsp_ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
1988
+ struct wsp_ggml_tensor * b, // input [W, H, D, C * N]
1989
+ int s0, // stride
1990
+ int s1,
1991
+ int s2,
1992
+ int p0, // padding
1993
+ int p1,
1994
+ int p2,
1995
+ int d0, // dilation
1996
+ int d1,
1997
+ int d2,
1998
+ int n_channels,
1999
+ int n_batch,
2000
+ int n_channels_out);
2001
+
1936
2002
  enum wsp_ggml_op_pool {
1937
2003
  WSP_GGML_OP_POOL_MAX,
1938
2004
  WSP_GGML_OP_POOL_AVG,
@@ -2023,6 +2089,19 @@ extern "C" {
2023
2089
  int p2,
2024
2090
  int p3);
2025
2091
 
2092
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad_ext(
2093
+ struct wsp_ggml_context * ctx,
2094
+ struct wsp_ggml_tensor * a,
2095
+ int lp0,
2096
+ int rp0,
2097
+ int lp1,
2098
+ int rp1,
2099
+ int lp2,
2100
+ int rp2,
2101
+ int lp3,
2102
+ int rp3
2103
+ );
2104
+
2026
2105
  // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
2027
2106
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad_reflect_1d(
2028
2107
  struct wsp_ggml_context * ctx,
@@ -17,6 +17,8 @@ using namespace facebook::jsi;
17
17
 
18
18
  namespace rnwhisper_jsi {
19
19
 
20
+ using namespace facebook::jsi;
21
+
20
22
  // Consolidated logging function
21
23
  enum class LogLevel { LOG_DEBUG, LOG_INFO, LOG_ERROR };
22
24
 
@@ -18,7 +18,7 @@ public:
18
18
  ThreadPool(size_t);
19
19
  template<class F, class... Args>
20
20
  auto enqueue(F&& f, Args&&... args)
21
- -> std::future<typename std::result_of<F(Args...)>::type>;
21
+ -> std::future<std::invoke_result_t<F, Args...>>;
22
22
  ~ThreadPool();
23
23
  private:
24
24
  // need to keep track of threads so we can join them
@@ -63,9 +63,9 @@ inline ThreadPool::ThreadPool(size_t threads)
63
63
  // add new work item to the pool
64
64
  template<class F, class... Args>
65
65
  auto ThreadPool::enqueue(F&& f, Args&&... args)
66
- -> std::future<typename std::result_of<F(Args...)>::type>
66
+ -> std::future<std::invoke_result_t<F, Args...>>
67
67
  {
68
- using return_type = typename std::result_of<F(Args...)>::type;
68
+ using return_type = std::invoke_result_t<F, Args...>;
69
69
 
70
70
  auto task = std::make_shared< std::packaged_task<return_type()> >(
71
71
  std::bind(std::forward<F>(f), std::forward<Args>(args)...)
package/cpp/whisper.cpp CHANGED
@@ -21,14 +21,12 @@
21
21
  #define _USE_MATH_DEFINES
22
22
  #include <cmath>
23
23
  #include <climits>
24
- #include <codecvt>
25
24
  #include <cstdarg>
26
25
  #include <cstdio>
27
26
  #include <cstring>
28
27
  #include <fstream>
29
28
  #include <functional>
30
29
  #include <map>
31
- #include <mutex>
32
30
  #include <random>
33
31
  #include <regex>
34
32
  #include <set>
@@ -36,6 +34,10 @@
36
34
  #include <thread>
37
35
  #include <vector>
38
36
 
37
+ #ifdef _MSC_VER
38
+ #include <codecvt>
39
+ #endif
40
+
39
41
  #if defined(WHISPER_BIG_ENDIAN)
40
42
  template<typename T>
41
43
  static T byteswap(T value) {
@@ -138,6 +140,10 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
138
140
  } while (0)
139
141
 
140
142
  #define WHISPER_MAX_DECODERS 8
143
+
144
+ // temperature below which we condition on past text history
145
+ static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f;
146
+
141
147
  #define WHISPER_MAX_NODES 4096
142
148
 
143
149
  static std::string format(const char * fmt, ...) {
@@ -252,45 +258,6 @@ static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i
252
258
  *(int32_t *) data = v;
253
259
  }
254
260
 
255
- // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
256
- // the idea is to represent the original matrix multiplication:
257
- //
258
- // Z = X @ Y
259
- //
260
- // with the sum of two matrix multiplications:
261
- //
262
- // Z = (X_0 @ Y_0) + (X_1 @ Y_1)
263
- //
264
- // here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
265
- // and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
266
- // general-purpose kernels
267
- //
268
- static struct wsp_ggml_tensor * wsp_ggml_mul_mat_pad(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * x, struct wsp_ggml_tensor * y, int pad = 32) {
269
- // use padding only if dimension 0 is at least 8 times larger than the padding
270
- // else we won't get much benefit from the optimization
271
- const int n_pad_req = 8;
272
-
273
- if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
274
- return wsp_ggml_mul_mat(ctx, x, y);
275
- }
276
-
277
- struct wsp_ggml_tensor * x_0 = wsp_ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
278
- struct wsp_ggml_tensor * x_1 = wsp_ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
279
-
280
- struct wsp_ggml_tensor * y_0 = wsp_ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
281
- struct wsp_ggml_tensor * y_1 = wsp_ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
282
-
283
- return wsp_ggml_add(ctx,
284
- wsp_ggml_mul_mat(ctx, x_0, y_0),
285
- wsp_ggml_mul_mat(ctx, x_1, y_1));
286
- }
287
-
288
- // TODO: check if other platforms can benefit from this optimization
289
- // TODO: CUDA is currently broken - seems wsp_ggml_mul_mat does not handle views correctly
290
- #if defined(WSP_GGML_USE_METAL)
291
- #define wsp_ggml_mul_mat wsp_ggml_mul_mat_pad
292
- #endif
293
-
294
261
  // available whisper models
295
262
  enum e_model {
296
263
  MODEL_UNKNOWN,
@@ -919,7 +886,10 @@ struct whisper_state {
919
886
  std::vector<float> logits;
920
887
 
921
888
  std::vector<whisper_segment> result_all;
922
- std::vector<whisper_token> prompt_past;
889
+
890
+ // prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1)
891
+ std::vector<whisper_token> prompt_past0; // static carried initial prompt (if enabled)
892
+ std::vector<whisper_token> prompt_past1; // dynamic context from decoded output
923
893
 
924
894
  int lang_id = 0; // english by default
925
895
 
@@ -3635,7 +3605,7 @@ struct whisper_context_params whisper_context_default_params() {
3635
3605
  struct whisper_context_params result = {
3636
3606
  /*.use_gpu =*/ true,
3637
3607
  /*.use_coreml =*/ false,
3638
- /*.flash_attn =*/ false,
3608
+ /*.flash_attn =*/ true,
3639
3609
  /*.gpu_device =*/ 0,
3640
3610
 
3641
3611
  /*.dtw_token_timestamps =*/ false,
@@ -4719,6 +4689,7 @@ static bool whisper_vad_init_context(whisper_vad_context * vctx) {
4719
4689
  wsp_ggml_set_name(vctx->c_state, "c_state");
4720
4690
 
4721
4691
  vctx->buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx, vctx->backends[0]);
4692
+ wsp_ggml_free(ctx);
4722
4693
  if (!vctx->buffer) {
4723
4694
  WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
4724
4695
  return false;
@@ -5463,6 +5434,9 @@ struct whisper_vad_segments * whisper_vad_segments_from_samples(
5463
5434
 
5464
5435
  void whisper_vad_free(whisper_vad_context * ctx) {
5465
5436
  if (ctx) {
5437
+ if (ctx->buffer) {
5438
+ wsp_ggml_backend_buffer_free(ctx->buffer);
5439
+ }
5466
5440
  for (wsp_ggml_context * context : ctx->model.ctxs) {
5467
5441
  wsp_ggml_free(context);
5468
5442
  }
@@ -5477,6 +5451,9 @@ void whisper_vad_free(whisper_vad_context * ctx) {
5477
5451
  wsp_ggml_backend_free(backend);
5478
5452
  }
5479
5453
 
5454
+ delete[] ctx->model.hparams.encoder_in_channels;
5455
+ delete[] ctx->model.hparams.encoder_out_channels;
5456
+ delete[] ctx->model.hparams.kernel_sizes;
5480
5457
 
5481
5458
  delete ctx;
5482
5459
  }
@@ -5956,9 +5933,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
5956
5933
 
5957
5934
  /* suppress_regex =*/ nullptr,
5958
5935
 
5959
- /*.initial_prompt =*/ nullptr,
5960
- /*.prompt_tokens =*/ nullptr,
5961
- /*.prompt_n_tokens =*/ 0,
5936
+ /*.initial_prompt =*/ nullptr,
5937
+ /*.carry_initial_prompt =*/ false,
5938
+ /*.prompt_tokens =*/ nullptr,
5939
+ /*.prompt_n_tokens =*/ 0,
5962
5940
 
5963
5941
  /*.language =*/ "en",
5964
5942
  /*.detect_language =*/ false,
@@ -6654,6 +6632,10 @@ static bool whisper_vad(
6654
6632
 
6655
6633
  whisper_vad_segments * vad_segments = whisper_vad_segments_from_samples(vctx, vad_params, samples, n_samples);
6656
6634
 
6635
+ if (!vad_segments) {
6636
+ return false;
6637
+ }
6638
+
6657
6639
  if (vad_segments->data.size() > 0) {
6658
6640
  state->has_vad_segments = true;
6659
6641
  ctx->state->vad_segments.clear();
@@ -6696,7 +6678,6 @@ static bool whisper_vad(
6696
6678
  } catch (const std::bad_alloc & /* e */) {
6697
6679
  WHISPER_LOG_ERROR("%s: failed to allocate memory for filtered samples\n", __func__);
6698
6680
  whisper_vad_free_segments(vad_segments);
6699
- whisper_vad_free(vctx);
6700
6681
  return false;
6701
6682
  }
6702
6683
 
@@ -6802,6 +6783,7 @@ static bool whisper_vad(
6802
6783
  __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
6803
6784
  }
6804
6785
 
6786
+ whisper_vad_free_segments(vad_segments);
6805
6787
  return true;
6806
6788
  }
6807
6789
 
@@ -6910,17 +6892,22 @@ int whisper_full_with_state(
6910
6892
  decoder.rng = std::mt19937(j);
6911
6893
  }
6912
6894
 
6913
- // the accumulated text context so far
6914
- auto & prompt_past = state->prompt_past;
6895
+ // the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1)
6896
+ auto & prompt_past0 = state->prompt_past0;
6897
+ auto & prompt_past1 = state->prompt_past1;
6915
6898
  if (params.no_context) {
6916
- prompt_past.clear();
6899
+ prompt_past0.clear();
6900
+ prompt_past1.clear();
6917
6901
  }
6918
6902
 
6903
+ // calculate the maximum context budget for prompt history
6904
+ const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
6905
+
6919
6906
  // prepare prompt
6920
6907
  {
6921
6908
  std::vector<whisper_token> prompt_tokens;
6922
6909
 
6923
- // initial prompt
6910
+ // tokenize the initial prompt
6924
6911
  if (!params.prompt_tokens && params.initial_prompt) {
6925
6912
  prompt_tokens.resize(1024);
6926
6913
  int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
@@ -6932,14 +6919,25 @@ int whisper_full_with_state(
6932
6919
  params.prompt_tokens = prompt_tokens.data();
6933
6920
  params.prompt_n_tokens = prompt_tokens.size();
6934
6921
  }
6935
-
6936
- // prepend the prompt tokens to the prompt_past
6937
6922
  if (params.prompt_tokens && params.prompt_n_tokens > 0) {
6938
- // parse tokens from the pointer
6939
- for (int i = 0; i < params.prompt_n_tokens; i++) {
6940
- prompt_past.push_back(params.prompt_tokens[i]);
6923
+ if (params.carry_initial_prompt) {
6924
+ if (prompt_past0.empty()) {
6925
+ const int max_tokens = std::max(1, max_prompt_ctx - 1);
6926
+
6927
+ if (params.prompt_n_tokens > max_tokens) {
6928
+ WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n",
6929
+ __func__, params.prompt_n_tokens, max_tokens);
6930
+ }
6931
+
6932
+ const int n_tokens = std::min(params.prompt_n_tokens, max_tokens);
6933
+ prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens);
6934
+ }
6935
+ } else {
6936
+ for (int i = 0; i < params.prompt_n_tokens; ++i) {
6937
+ prompt_past1.push_back(params.prompt_tokens[i]);
6938
+ }
6939
+ std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end());
6941
6940
  }
6942
- std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
6943
6941
  }
6944
6942
  }
6945
6943
 
@@ -7025,7 +7023,8 @@ int whisper_full_with_state(
7025
7023
  // if there is a very short audio segment left to process, we remove any past prompt since it tends
7026
7024
  // to confuse the decoder and often make it repeat or hallucinate stuff
7027
7025
  if (seek > seek_start && seek + 500 >= seek_end) {
7028
- prompt_past.clear();
7026
+ prompt_past0.clear();
7027
+ prompt_past1.clear();
7029
7028
  }
7030
7029
 
7031
7030
  int best_decoder_id = 0;
@@ -7086,12 +7085,25 @@ int whisper_full_with_state(
7086
7085
  {
7087
7086
  prompt.clear();
7088
7087
 
7089
- // if we have already generated some text, use it as a prompt to condition the next generation
7090
- if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
7091
- int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
7088
+ if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
7089
+ const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
7090
+ const bool can_take1 = !prompt_past1.empty();
7092
7091
 
7093
- prompt = { whisper_token_prev(ctx) };
7094
- prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
7092
+ if (max_prompt_ctx > 0 && (can_take0 || can_take1)) {
7093
+ // Always start with previous token marker to connect continuity
7094
+ prompt.push_back(whisper_token_prev(ctx));
7095
+
7096
+ // Take static tokens (initial prompt) first
7097
+ int n_take0 = 0;
7098
+ if (can_take0) {
7099
+ n_take0 = prompt_past0.size();
7100
+ prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
7101
+ }
7102
+
7103
+ // Fill remaining budget with dynamic tokens (rolling context)
7104
+ const int n_take1 = std::min<int>(max_prompt_ctx - n_take0 - 1, prompt_past1.size());
7105
+ prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
7106
+ }
7095
7107
  }
7096
7108
 
7097
7109
  // init new transcription with sot, language (opt) and task tokens
@@ -7573,14 +7585,17 @@ int whisper_full_with_state(
7573
7585
 
7574
7586
  //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
7575
7587
 
7576
- // update prompt_past
7577
- prompt_past.clear();
7578
- if (prompt.front() == whisper_token_prev(ctx)) {
7579
- prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
7588
+ // update prompt_past1
7589
+ prompt_past1.clear();
7590
+ if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
7591
+ prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
7580
7592
  }
7581
7593
 
7582
- for (int i = 0; i < result_len && !is_no_speech; ++i) {
7583
- prompt_past.push_back(tokens_cur[i].id);
7594
+ // Add newly decoded tokens to the rolling context
7595
+ if (!is_no_speech) {
7596
+ for (int i = 0; i < result_len; ++i) {
7597
+ prompt_past1.push_back(tokens_cur[i].id);
7598
+ }
7584
7599
  }
7585
7600
 
7586
7601
  if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
@@ -8952,7 +8967,7 @@ void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
8952
8967
  }
8953
8968
 
8954
8969
  const char * whisper_version(void) {
8955
- return "1.7.6";
8970
+ return "1.8.0";
8956
8971
  }
8957
8972
 
8958
8973
  WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
package/cpp/whisper.h CHANGED
@@ -526,6 +526,7 @@ extern "C" {
526
526
  // use whisper_tokenize() to convert text to tokens
527
527
  // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
528
528
  const char * initial_prompt;
529
+ bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text)
529
530
  const whisper_token * prompt_tokens;
530
531
  int prompt_n_tokens;
531
532
 
@@ -55,7 +55,12 @@ add_library(rnwhisper SHARED
55
55
  ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
56
56
  ${SOURCE_DIR}/ggml-cpu/vec.cpp
57
57
  ${SOURCE_DIR}/ggml-cpu/ops.cpp
58
- ${SOURCE_DIR}/ggml-metal.m
58
+ ${SOURCE_DIR}/ggml-metal/ggml-metal.cpp
59
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-common.cpp
60
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-device.cpp
61
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-context.m
62
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-device.m
63
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-ops.cpp
59
64
  ${SOURCE_DIR}/ggml-opt.cpp
60
65
  ${SOURCE_DIR}/ggml-threading.cpp
61
66
  ${SOURCE_DIR}/ggml-quants.c
@@ -20,27 +20,28 @@
20
20
 
21
21
  #ifdef WSP_GGML_USE_METAL
22
22
  if (ctx_params.use_gpu) {
23
- ctx_params.gpu_device = 0;
23
+ // TODO: GPU VAD is forced disabled until the performance is improved (ref: whisper.cpp/whisper_vad_init_context)
24
+ ctx_params.use_gpu = false;
25
+ // ctx_params.gpu_device = 0;
24
26
 
25
- id<MTLDevice> device = MTLCreateSystemDefaultDevice();
27
+ // id<MTLDevice> device = MTLCreateSystemDefaultDevice();
26
28
 
27
- // Check ggml-metal availability
28
- BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
29
- if (@available(iOS 16.0, tvOS 16.0, *)) {
30
- supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
31
- }
32
- if (!supportsGgmlMetal) {
33
- ctx_params.use_gpu = false;
34
- reasonNoMetal = @"Metal is not supported in this device";
35
- }
29
+ // // Check ggml-metal availability
30
+ // BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
31
+ // if (@available(iOS 16.0, tvOS 16.0, *)) {
32
+ // supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
33
+ // }
34
+ // if (!supportsGgmlMetal) {
35
+ // ctx_params.use_gpu = false;
36
+ // reasonNoMetal = @"Metal is not supported in this device";
37
+ // }
38
+ // device = nil;
36
39
 
37
40
  #if TARGET_OS_SIMULATOR
38
41
  // Use the backend, but no layers because not supported fully on simulator
39
42
  ctx_params.use_gpu = false;
40
43
  reasonNoMetal = @"Metal is not supported in simulator";
41
44
  #endif
42
-
43
- device = nil;
44
45
  }
45
46
  #endif // WSP_GGML_USE_METAL
46
47
 
@@ -8,7 +8,7 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
- #define WSP_GGML_BACKEND_API_VERSION 1
11
+ #define WSP_GGML_BACKEND_API_VERSION 2
12
12
 
13
13
  //
14
14
  // Backend buffer type
@@ -114,6 +114,9 @@ extern "C" {
114
114
  void (*event_record)(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
115
115
  // wait for an event on on a different stream
116
116
  void (*event_wait) (wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
117
+
118
+ // (optional) sort/optimize the nodes in the graph
119
+ void (*graph_optimize) (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
117
120
  };
118
121
 
119
122
  struct wsp_ggml_backend {
@@ -132,6 +132,8 @@ extern "C" {
132
132
  WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
133
133
  // GPU device using dedicated memory
134
134
  WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
135
+ // integrated GPU device using host memory
136
+ WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
135
137
  // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
136
138
  WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
137
139
  };
@@ -150,11 +152,21 @@ extern "C" {
150
152
 
151
153
  // all the device properties
152
154
  struct wsp_ggml_backend_dev_props {
155
+ // device name
153
156
  const char * name;
157
+ // device description
154
158
  const char * description;
159
+ // device free memory in bytes
155
160
  size_t memory_free;
161
+ // device total memory in bytes
156
162
  size_t memory_total;
163
+ // device type
157
164
  enum wsp_ggml_backend_dev_type type;
165
+ // device id
166
+ // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
167
+ // if the id is unknown, this should be NULL
168
+ const char * device_id;
169
+ // device capabilities
158
170
  struct wsp_ggml_backend_dev_caps caps;
159
171
  };
160
172
 
@@ -302,11 +314,15 @@ extern "C" {
302
314
  WSP_GGML_API int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
303
315
  WSP_GGML_API int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);
304
316
 
305
- WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
317
+ WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
318
+ WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
306
319
 
307
320
  WSP_GGML_API void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
308
321
  WSP_GGML_API wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);
309
322
 
323
+ // Split graph without allocating it
324
+ WSP_GGML_API void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
325
+
310
326
  // Allocate and compute graph on the backend scheduler
311
327
  WSP_GGML_API bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
312
328
  WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
@@ -101,7 +101,6 @@ extern "C" {
101
101
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v (void);
102
102
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx (void);
103
103
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe (void);
104
- WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa (void);
105
104
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd (void);
106
105
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile (void);
107
106
 
@@ -135,6 +134,7 @@ extern "C" {
135
134
  WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
136
135
 
137
136
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
137
+ WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
138
138
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
139
139
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
140
140
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);