npm - whisper.rn - Versions diffs - 0.3.9 → 0.4.0-rc.0 - Mend

whisper.rn 0.3.9 → 0.4.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/android/src/main/CMakeLists.txt +2 -1
package/android/src/main/jni.cpp +7 -1
package/cpp/coreml/whisper-encoder.mm +7 -1
package/cpp/ggml-alloc.c +633 -0
package/cpp/ggml-alloc.h +26 -0
package/cpp/ggml-metal.h +85 -0
package/cpp/ggml-metal.m +1283 -0
package/cpp/ggml-metal.metal +2353 -0
package/cpp/ggml.c +5024 -2924
package/cpp/ggml.h +569 -95
package/cpp/whisper.cpp +993 -668
package/cpp/whisper.h +10 -0
package/ios/RNWhisperContext.mm +9 -3
package/package.json +1 -1
package/whisper-rn.podspec +8 -2

package/cpp/whisper.cpp CHANGED Viewed

@@ -3,11 +3,16 @@
 #include "coreml/whisper-encoder.h"
 #endif
-#if WHISPER_USE_OPENVINO
+#ifdef WSP_GGML_USE_METAL
+#  include "ggml-metal.h"
+#endif
+#ifdef WHISPER_USE_OPENVINO
 #include "openvino/whisper-openvino-encoder.h"
 #endif
 #include "ggml.h"
+#include "ggml-alloc.h"
 #include <algorithm>
 #include <cassert>
@@ -18,11 +23,13 @@
 #include <cstring>
 #include <fstream>
 #include <map>
+#include <set>
 #include <string>
 #include <thread>
 #include <vector>
 #include <regex>
 #include <random>
+#include <functional>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -114,8 +121,66 @@ static void byteswap_tensor(wsp_ggml_tensor * tensor) {
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 16
-#define WHISPER_USE_SCRATCH
-#define WHISPER_MAX_SCRATCH_BUFFERS 16
+//
+// ggml helpers
+//
+static void wsp_ggml_graph_compute_helper( // plans `graph` for `n_threads`, then computes it with that plan
+        std::vector<uint8_t> & buf, // caller-owned scratch; resized to plan.work_size only when that size is non-zero
+                 wsp_ggml_cgraph * graph, // graph handed to both wsp_ggml_graph_plan and wsp_ggml_graph_compute
+                         int   n_threads, // forwarded unchanged to wsp_ggml_graph_plan
+      whisper_abort_callback   abort_callback, // stored on the plan as-is; no null check is done here
+                        void * abort_callback_data) { // opaque pointer stored on the plan next to the callback
+    struct wsp_ggml_cplan plan = wsp_ggml_graph_plan(graph, n_threads);
+    plan.abort_callback = abort_callback;
+    plan.abort_callback_data = abort_callback_data;
+    if (plan.work_size > 0) { // a zero work_size leaves both `buf` and plan.work_data untouched
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data(); // points into `buf`, so `buf` must not be resized or destroyed while the graph runs
+    }
+    wsp_ggml_graph_compute(graph, &plan); // NOTE(review): any return value is ignored here; confirm against ggml.h
+}
+// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
+// the idea is to represent the original matrix multiplication:
+//
+//   Z = X @ Y
+//
+// with the sum of two matrix multiplications:
+//
+//   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
+//
+// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
+// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
+// general-purpose kernels
+//
+static struct wsp_ggml_tensor * wsp_ggml_mul_mat_pad(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * x, struct wsp_ggml_tensor * y, int pad = 32) { // pad: block size that dimension 0 is split on
+    // use padding only if dimension 0 is at least 8 times larger than the padding
+    // else we won't get much benefit from the optimization
+    const int n_pad_req = 8;
+    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) { // already a multiple of pad, or shorter than n_pad_req*pad: no split
+        return wsp_ggml_mul_mat(ctx, x, y); // the #define that maps wsp_ggml_mul_mat to this function follows this body in the hunk, so this is the plain multiply
+    }
+    struct wsp_ggml_tensor * x_0 = wsp_ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0); // leading part of dim 0, rounded down to a multiple of pad
+    struct wsp_ggml_tensor * x_1 = wsp_ggml_view_3d(ctx, x,  x->ne[0]%pad,      x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]); // remainder of dim 0; byte offset starts right after x_0's span
+    struct wsp_ggml_tensor * y_0 = wsp_ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0); // same split applied to y; NOTE(review): only x->ne[0] is tested above, y presumably shares ne[0] with x — confirm
+    struct wsp_ggml_tensor * y_1 = wsp_ggml_view_3d(ctx, y,  y->ne[0]%pad,      y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]); // remainder of y along dim 0
+    return wsp_ggml_add(ctx, // Z = (X_0 @ Y_0) + (X_1 @ Y_1)
+            wsp_ggml_mul_mat(ctx, x_0, y_0),
+            wsp_ggml_mul_mat(ctx, x_1, y_1));
+}
+// TODO: check if other platforms can benefit from this optimization
+#if defined(WSP_GGML_USE_METAL)
+#define wsp_ggml_mul_mat wsp_ggml_mul_mat_pad
+#endif
 // available whisper models
 enum e_model {
@@ -231,38 +296,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
 static const size_t MB = 1ull*1024*1024;
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,     62ull*MB },
-    { MODEL_BASE,     80ull*MB },
-    { MODEL_SMALL,   120ull*MB },
-    { MODEL_MEDIUM,  158ull*MB },
-    { MODEL_LARGE,   198ull*MB },
-};
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
-    { MODEL_TINY,     18ull*MB },
-    { MODEL_BASE,     24ull*MB },
-    { MODEL_SMALL,    36ull*MB },
-    { MODEL_MEDIUM,   48ull*MB },
-    { MODEL_LARGE,    60ull*MB },
-};
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH2 = {
-    { MODEL_TINY,      4ull*MB },
-    { MODEL_BASE,      4ull*MB },
-    { MODEL_SMALL,     6ull*MB },
-    { MODEL_MEDIUM,    7ull*MB },
-    { MODEL_LARGE,     9ull*MB },
-};
-static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
-    { MODEL_TINY,      4ull*MB },
-    { MODEL_BASE,      4ull*MB },
-    { MODEL_SMALL,     6ull*MB },
-    { MODEL_MEDIUM,    7ull*MB },
-    { MODEL_LARGE,     9ull*MB },
-};
+// TODO: avoid using GGUF
 static const std::map<wsp_ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
     { WSP_GGML_TYPE_F32,
         {
@@ -329,38 +363,6 @@ static const std::map<wsp_ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL =
     },
 };
-static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
-    { MODEL_TINY,      3ull*MB },
-    { MODEL_BASE,      6ull*MB },
-    { MODEL_SMALL,    16ull*MB },
-    { MODEL_MEDIUM,   43ull*MB },
-    { MODEL_LARGE,    71ull*MB },
-};
-static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
-    { MODEL_TINY,      9ull*MB },
-    { MODEL_BASE,     18ull*MB },
-    { MODEL_SMALL,    53ull*MB },
-    { MODEL_MEDIUM,  141ull*MB },
-    { MODEL_LARGE,   235ull*MB },
-};
-static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,     30ull*MB },
-    { MODEL_BASE,     38ull*MB },
-    { MODEL_SMALL,    56ull*MB },
-    { MODEL_MEDIUM,   74ull*MB },
-    { MODEL_LARGE,    94ull*MB },
-};
-static const std::map<e_model, size_t> MEM_REQ_DECODE = {
-    { MODEL_TINY,      3ull*MB },
-    { MODEL_BASE,      5ull*MB },
-    { MODEL_SMALL,    10ull*MB },
-    { MODEL_MEDIUM,   18ull*MB },
-    { MODEL_LARGE,    27ull*MB },
-};
 struct whisper_mel {
     int n_len;
     int n_len_org;
@@ -441,6 +443,7 @@ struct whisper_hparams {
     int32_t n_text_layer  = 4;
     int32_t n_mels        = 80;
     int32_t ftype         = 1;
+    float   eps           = 1e-5f;
 };
 // audio encoding layer
@@ -536,6 +539,7 @@ struct whisper_kv_cache {
     struct wsp_ggml_context * ctx;
+    // buf points to the memory allocated for both wsp_ggml_tensor 'k' and 'v' (see kv_cache_init)
     std::vector<uint8_t> buf;
     int n; // number of tokens currently in the cache
@@ -601,7 +605,7 @@ struct whisper_sequence {
 // TAGS: WHISPER_DECODER_INIT
 struct whisper_decoder {
-    // each decoders keeps its own KV-cache
+    // each decoder keeps its own KV-cache
     whisper_kv_cache kv_self;
     // the currently generated sequence of tokens
@@ -621,15 +625,75 @@ struct whisper_decoder {
     std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
 };
+// custom two-member pair used in place of std::pair (upstream reason: std::pair is very slow; NOTE(review): no measurement is shown here)
+template<typename A, typename B>
+struct whisper_pair {
+    A first;
+    B second;
+    // Two-argument constructor: copies a into first and b into second.
+    whisper_pair(const A& a, const B& b) : first(a), second(b) {}
+    // Default constructor: value-initializes both members via A() and B().
+    whisper_pair() : first(A()), second(B()) {}
+};
+// beam-search helpers: kv_buf is a pair of raw byte buffers (whisper_state keeps a vector of them as kv_swap_bufs)
+struct kv_buf {
+    std::vector<uint8_t> k; // raw bytes for the "k" side; presumably a copy of a KV cache's k tensor data — confirm at the use site
+    std::vector<uint8_t> v; // raw bytes for the "v" side; same caveat as k
+};
+// wsp_ggml_allocr wrapper for whisper usage: one allocator handle plus the two byte buffers it works with
+struct whisper_allocr {
+    wsp_ggml_allocr * alloc = nullptr; // set by whisper_allocr_graph_init; released and reset to nullptr by whisper_allocr_free
+    std::vector<uint8_t> meta; // passed as mem_buffer to wsp_ggml_init (with no_alloc = true) by the graph builders
+    std::vector<uint8_t> data; // backing storage handed to wsp_ggml_allocr_new
+};
+static size_t whisper_allocr_size(struct whisper_allocr & allocr) { // total bytes currently held by the meta and data vectors
+    return allocr.meta.size() + allocr.data.size(); // size(), not capacity(); the allocator handle itself is not counted
+}
+// measure the memory usage of a graph and prepare the allocr's internal data buffer; on return allocr.alloc is a non-measure allocator over allocr.data
+static void whisper_allocr_graph_init(struct whisper_allocr & allocr, std::function<struct wsp_ggml_cgraph *()> && get_graph) { // get_graph is called exactly once, while `alloc` is the measure allocator
+    const int tensor_alignment = 32;
+    auto & alloc = allocr.alloc;
+    auto & meta  = allocr.meta;
+    auto & data  = allocr.data;
+    meta.resize(wsp_ggml_tensor_overhead()*WSP_GGML_MAX_NODES + wsp_ggml_graph_overhead()); // sized before get_graph() runs; presumably room for WSP_GGML_MAX_NODES tensor headers plus one graph — confirm against ggml.h
+    alloc = wsp_ggml_allocr_new_measure(tensor_alignment); // NOTE(review): a previous non-null allocr.alloc is overwritten here without being freed — confirm callers init only once
+    const size_t alloc_size = wsp_ggml_allocr_alloc_graph(alloc, get_graph()) + tensor_alignment; // value returned by the measure pass plus one alignment unit of slack
+    wsp_ggml_allocr_free(alloc); // drop the measure allocator; `alloc` is reassigned two lines below
+    data.resize(alloc_size);
+    alloc = wsp_ggml_allocr_new(data.data(), data.size(), tensor_alignment); // allocator over `data`; resizing `data` afterwards would leave it holding a stale pointer
+}
+static void whisper_allocr_free(struct whisper_allocr & allocr) { // frees the allocator handle if one is set; a second call is a no-op
+    if (allocr.alloc) {
+        wsp_ggml_allocr_free(allocr.alloc);
+        allocr.alloc = nullptr; // the meta and data vectors are left untouched
+    }
+}
 struct whisper_state {
     int64_t t_sample_us = 0;
     int64_t t_encode_us = 0;
     int64_t t_decode_us = 0;
+    int64_t t_prompt_us = 0;
     int64_t t_mel_us = 0;
     int32_t n_sample = 0; // number of tokens sampled
     int32_t n_encode = 0; // number of encoder calls
-    int32_t n_decode = 0; // number of decoder calls
+    int32_t n_decode = 0; // number of decoder calls with n_tokens == 1 (text-generation)
+    int32_t n_prompt = 0; // number of decoder calls with n_tokens >  1 (prompt encoding)
     int32_t n_fail_p = 0; // number of logprob threshold failures
     int32_t n_fail_h = 0; // number of entropy threshold failures
@@ -640,12 +704,23 @@ struct whisper_state {
     whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
-    // memory buffers used by encode / decode contexts
-    std::vector<uint8_t> buf_compute;
-    std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
+    // buffer for swapping KV caches between decoders during beam-search
+    std::vector<kv_buf> kv_swap_bufs;
+    // reusable buffer for `struct wsp_ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
-    int    buf_last = 0;
-    size_t buf_max_size[WHISPER_MAX_SCRATCH_BUFFERS] = { 0 };
+    // ggml-alloc:
+    // - stores meta info about the intermediate tensors into the `meta` buffers
+    // - stores the actual tensor data into the `data` buffers
+    whisper_allocr alloc_conv;
+    whisper_allocr alloc_encode;
+    whisper_allocr alloc_cross;
+    whisper_allocr alloc_decode;
+    // result of the encoder
+    struct wsp_ggml_tensor * embd_conv = nullptr;
+    struct wsp_ggml_tensor * embd_enc  = nullptr;
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
@@ -654,7 +729,7 @@ struct whisper_state {
     std::vector<whisper_token>   prompt_past;
     // work container used to avoid memory allocations
-    std::vector<std::pair<double, whisper_vocab::id>> logits_id;
+    std::vector<whisper_pair<double, whisper_vocab::id>> logits_id;
     mutable std::mt19937 rng; // used for sampling at t > 0.0
@@ -665,6 +740,10 @@ struct whisper_state {
     whisper_coreml_context * ctx_coreml = nullptr;
 #endif
+#ifdef WSP_GGML_USE_METAL
+    wsp_ggml_metal_context * ctx_metal = nullptr;
+#endif
 #ifdef WHISPER_USE_OPENVINO
     whisper_openvino_context * ctx_openvino = nullptr;
 #endif
@@ -677,37 +756,6 @@ struct whisper_state {
     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default
-    void use_buf(struct wsp_ggml_context * ctx, int i) {
-#if defined(WHISPER_USE_SCRATCH)
-        size_t last_size = 0;
-        if (i == -1) {
-            last_size = wsp_ggml_set_scratch(ctx, { 0, 0, nullptr, });
-        } else {
-            auto & buf = buf_scratch[i];
-            last_size = wsp_ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
-        }
-        if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
-        }
-        buf_last = i;
-#else
-        (void) i;
-        (void) ctx;
-#endif
-    }
-    size_t get_buf_max_mem(int i) const {
-#if defined(WHISPER_USE_SCRATCH)
-        return buf_max_size[i];
-#else
-        (void) i;
-        return 0;
-#endif
-    }
 };
 struct whisper_context {
@@ -730,6 +778,13 @@ static void whisper_default_log(const char * text) {
 static whisper_log_callback whisper_log = whisper_default_log;
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((gnu_format(printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
 static void log(const char * fmt, ...) {
     if (!whisper_log) return;
     char buf[1024];
@@ -747,10 +802,17 @@ static void read_safe(whisper_model_loader * loader, T & dest) {
 static bool kv_cache_init(
         const struct whisper_hparams & hparams,
-                        const size_t   mem_bytes,
              struct whisper_kv_cache & cache,
                            wsp_ggml_type   wtype,
                                  int   n_ctx) {
+    const int64_t n_text_state = hparams.n_text_state;
+    const int64_t n_text_layer = hparams.n_text_layer;
+    const int64_t n_mem      = n_text_layer*n_ctx;
+    const int64_t n_elements = n_text_state*n_mem;
+    const size_t mem_bytes = 2*(wsp_ggml_type_size(wtype)*n_elements + wsp_ggml_tensor_overhead());
     cache.buf.resize(mem_bytes);
     struct wsp_ggml_init_params params = {
@@ -766,12 +828,6 @@ static bool kv_cache_init(
         return false;
     }
-    const int n_text_state = hparams.n_text_state;
-    const int n_text_layer = hparams.n_text_layer;
-    const int n_mem      = n_text_layer*n_ctx;
-    const int n_elements = n_text_state*n_mem;
     cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
@@ -914,22 +970,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         // print memory requirements
         {
-            // this is the total memory required to run the inference
-            const size_t mem_required =
-                     MEM_REQ_SCRATCH0.at(model.type) +
-                     MEM_REQ_SCRATCH1.at(model.type) +
-                     MEM_REQ_SCRATCH2.at(model.type) +
-                     MEM_REQ_SCRATCH3.at(model.type) +
-                scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
-                scale*MEM_REQ_KV_CROSS.at(model.type) +
-                scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
-            // this is the memory required by one decoder
-            const size_t mem_required_decoder =
-                scale*MEM_REQ_KV_SELF.at(model.type);
-            log("%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
-                    mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
+            // TODO
+            //log("%s: mem required  = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
+            //        mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
         }
         // initialize all memory buffers
@@ -1438,49 +1481,56 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
     return true;
 }
-// evaluate the encoder with the given state
-//
-// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder
-// part of the transformer model and returns the encoded features
-//
-//   - wctx:      the model
-//   - wstate:     the state of the encoder
-//   - n_threads:  number of threads to use
-//   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
-//
-static bool whisper_encode_internal(
-        whisper_context & wctx,
-          whisper_state & wstate,
-              const int   mel_offset,
-              const int   n_threads){
+static bool whisper_encode_external(const whisper_state & wstate) { // true when wstate has a non-null Core ML or OpenVINO context
+    WSP_GGML_UNUSED(wstate); // presumably silences the unused-parameter warning when neither backend is compiled in
-    const int64_t t_start_us = wsp_ggml_time_us();
+#ifndef WHISPER_USE_COREML
+    const bool use_coreml = false; // Core ML support not compiled in
+#else
+    const bool use_coreml = wstate.ctx_coreml != nullptr; // compiled in: used only if a context was actually created
+#endif
+#ifndef WHISPER_USE_OPENVINO
+    const bool use_openvino = false; // OpenVINO support not compiled in
+#else
+    const bool use_openvino = wstate.ctx_openvino != nullptr; // compiled in: used only if a context was actually created
+#endif
+    return use_coreml || use_openvino;
+}
+static struct wsp_ggml_cgraph * whisper_build_graph_conv(
+        whisper_context & wctx,
+          whisper_state & wstate,
+              const int   mel_offset) {
     const auto & model   = wctx.model;
     const auto & mel_inp = wstate.mel;
     const auto & hparams = model.hparams;
     const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-    const int n_state = hparams.n_audio_state;
-    const int n_head  = hparams.n_audio_head;
-    const int n_layer = hparams.n_audio_layer;
+    const int n_state = hparams.n_audio_state; WSP_GGML_UNUSED(n_state);
     const int n_mels = hparams.n_mels;
-    assert(mel_inp.n_mel == n_mels);
     struct wsp_ggml_init_params params = {
-        /*.mem_size   =*/ wstate.buf_compute.size(),
-        /*.mem_buffer =*/ wstate.buf_compute.data(),
-        /*.no_alloc   =*/ false,
+        /*.mem_size   =*/ wstate.alloc_conv.meta.size(),
+        /*.mem_buffer =*/ wstate.alloc_conv.meta.data(),
+        /*.no_alloc   =*/ true,
     };
     struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
-    wstate.use_buf(ctx0, 0);
+    wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx0);
+    wsp_ggml_allocr * alloc = wstate.alloc_conv.alloc;
     struct wsp_ggml_tensor * mel = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, 2*n_ctx, n_mels);
+    wsp_ggml_allocr_alloc(alloc, mel);
     assert(mel->type == WSP_GGML_TYPE_F32);
-    {
+    if (!wsp_ggml_allocr_is_measure(alloc)) {
+        assert(mel_inp.n_mel == n_mels);
         float * dst = (float *) mel->data;
         memset(dst, 0, wsp_ggml_nbytes(mel));
@@ -1494,25 +1544,11 @@ static bool whisper_encode_internal(
         }
     }
-    struct wsp_ggml_tensor * cur;
+    struct wsp_ggml_tensor * cur = nullptr;
-#ifndef WHISPER_USE_COREML
-    const bool use_coreml = false;
-#else
-    const bool use_coreml = wstate.ctx_coreml != nullptr;
-#endif
-#ifndef WHISPER_USE_OPENVINO
-    const bool use_openvino = false;
-#else
-    const bool use_openvino = wstate.ctx_openvino != nullptr;
-#endif
-    if (!use_coreml && !use_openvino) {
+    if (!whisper_encode_external(wstate)) {
         // convolution + gelu
         {
-            wstate.use_buf(ctx0, 1);
             cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
             cur = wsp_ggml_add(ctx0,
                     wsp_ggml_repeat(ctx0,
@@ -1522,8 +1558,6 @@ static bool whisper_encode_internal(
             cur = wsp_ggml_gelu(ctx0, cur);
-            wstate.use_buf(ctx0, 0);
             cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
             cur = wsp_ggml_add(ctx0,
                     wsp_ggml_repeat(ctx0,
@@ -1534,373 +1568,433 @@ static bool whisper_encode_internal(
             cur = wsp_ggml_gelu(ctx0, cur);
         }
-        wstate.use_buf(ctx0, 3);
+        wstate.embd_conv = cur;
+    } else {
+#ifdef WHISPER_USE_COREML
+        cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
+        wsp_ggml_allocr_alloc(alloc, cur);
-        // ===================================================================
-        // NOTE: experimenting with partial evaluation of the encoder (ignore)
-        //static int iter = -1;
-        //const int n_iter = 1500/n_ctx;
+        if (!wsp_ggml_allocr_is_measure(alloc)) {
+            whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+        }
+#endif
+#ifdef WHISPER_USE_OPENVINO
+        cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
+        wsp_ggml_allocr_alloc(alloc, cur);
-        //iter = (iter + 1) % n_iter;
+        if (!wsp_ggml_allocr_is_measure(alloc)) {
+            whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
+        }
+#endif
-        //if (iter == 0) {
-        //    memset(model.memory_cross_k->data, 0, wsp_ggml_nbytes(model.memory_cross_k));
-        //    memset(model.memory_cross_v->data, 0, wsp_ggml_nbytes(model.memory_cross_v));
-        //}
+        wstate.embd_enc = cur;
+    }
-        static int iter = 0;
+    wsp_ggml_build_forward_expand(gf, cur);
-        const size_t e_pe_stride = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe);
-        const size_t e_pe_offset = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe)*n_ctx*iter;
+    wsp_ggml_free(ctx0);
-        struct wsp_ggml_tensor * e_pe = wsp_ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
+    return gf;
+}
-        cur = wsp_ggml_add(ctx0, e_pe, wsp_ggml_transpose(ctx0, cur));
+static struct wsp_ggml_cgraph * whisper_build_graph_encoder(
+        whisper_context & wctx,
+          whisper_state & wstate) {
+    const auto & model   = wctx.model;
+    const auto & hparams = model.hparams;
-        // ===================================================================
+    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
+    const int n_state = hparams.n_audio_state;
+    const int n_head  = hparams.n_audio_head;
+    const int n_layer = hparams.n_audio_layer;
-        // original:
-        //cur = wsp_ggml_add(ctx0, model.e_pe, wsp_ggml_transpose(ctx0, cur));
+    struct wsp_ggml_init_params params = {
+        /*.mem_size   =*/ wstate.alloc_encode.meta.size(),
+        /*.mem_buffer =*/ wstate.alloc_encode.meta.data(),
+        /*.no_alloc   =*/ true,
+    };
-        struct wsp_ggml_tensor * inpL = cur;
+    struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
-        for (int il = 0; il < n_layer; ++il) {
-            const auto & layer = model.layers_encoder[il];
+    wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx0);
-            // norm
-            {
-                wstate.use_buf(ctx0, 0);
+    wsp_ggml_allocr * alloc = wstate.alloc_encode.alloc;
-                cur = wsp_ggml_norm(ctx0, inpL);
+    struct wsp_ggml_tensor * KQscale = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_F32, 1);
+    wsp_ggml_allocr_alloc(alloc, KQscale);
-                // cur = ln_0_w*cur + ln_0_b
-                cur = wsp_ggml_add(ctx0,
-                        wsp_ggml_mul(ctx0,
-                            wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
-                            cur),
-                        wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
-            }
+    if (!wsp_ggml_allocr_is_measure(alloc)) {
+        wsp_ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head));
+    }
-            // self-attention
-            {
-                wstate.use_buf(ctx0, 1);
+    struct wsp_ggml_tensor * cur = wsp_ggml_view_tensor(ctx0, wstate.embd_conv);
-                struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
-                        layer.attn_q_w,
-                        cur);
+    // ===================================================================
+    // NOTE: experimenting with partial evaluation of the encoder (ignore)
+    //static int iter = -1;
+    //const int n_iter = 1500/n_ctx;
-                Qcur = wsp_ggml_add(ctx0,
-                        wsp_ggml_repeat(ctx0,
-                            layer.attn_q_b,
-                            Qcur),
-                        Qcur);
+    //iter = (iter + 1) % n_iter;
-                //Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+    //if (iter == 0) {
+    //    memset(model.memory_cross_k->data, 0, wsp_ggml_nbytes(model.memory_cross_k));
+    //    memset(model.memory_cross_v->data, 0, wsp_ggml_nbytes(model.memory_cross_v));
+    //}
-                // note: no bias for Key
-                struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
-                        layer.attn_k_w,
-                        cur);
+    static int iter = 0;
-                //Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+    const size_t e_pe_stride = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe);
+    const size_t e_pe_offset = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe)*n_ctx*iter;
-                struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
-                        layer.attn_v_w,
-                        cur);
+    struct wsp_ggml_tensor * e_pe = wsp_ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
-                Vcur = wsp_ggml_add(ctx0,
-                        wsp_ggml_repeat(ctx0,
-                            layer.attn_v_b,
-                            Vcur),
-                        Vcur);
+    cur = wsp_ggml_add(ctx0, e_pe, wsp_ggml_cont(ctx0, wsp_ggml_transpose(ctx0, cur)));
+    // ===================================================================
+    // original:
+    //cur = wsp_ggml_add(ctx0, model.e_pe, wsp_ggml_transpose(ctx0, cur));
+    struct wsp_ggml_tensor * inpL = cur;
-                // ------
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers_encoder[il];
+        // norm
+        {
+            cur = wsp_ggml_norm(ctx0, inpL, hparams.eps);
+            // cur = ln_0_w*cur + ln_0_b
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0, cur, layer.attn_ln_0_w),
+                    layer.attn_ln_0_b);
+        }
+        // self-attention
+        {
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
+                    layer.attn_q_w,
+                    cur);
+            Qcur = wsp_ggml_add(ctx0, Qcur, layer.attn_q_b);
+            //Qcur = wsp_ggml_scale(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            // note: no bias for Key
+            struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
+                    layer.attn_k_w,
+                    cur);
+            //Kcur = wsp_ggml_scale(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
+                    layer.attn_v_w,
+                    cur);
+            Vcur = wsp_ggml_add(ctx0, Vcur, layer.attn_v_b);
-                wstate.use_buf(ctx0, 0);
+            // ------
 #ifdef WHISPER_USE_FLASH_ATTN
-                struct wsp_ggml_tensor * Q =
-                    wsp_ggml_permute(ctx0,
-                            wsp_ggml_cpy(ctx0,
-                                Qcur,
-                                wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-                struct wsp_ggml_tensor * K =
-                    wsp_ggml_permute(ctx0,
-                            wsp_ggml_cpy(ctx0,
-                                Kcur,
-                                wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-                struct wsp_ggml_tensor * V =
-                    wsp_ggml_cpy(ctx0,
-                            wsp_ggml_permute(ctx0,
-                                wsp_ggml_reshape_3d(ctx0,
-                                    Vcur,
-                                    n_state/n_head, n_head, n_ctx),
-                                1, 2, 0, 3),
-                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
-                struct wsp_ggml_tensor * KQV = wsp_ggml_flash_attn(ctx0, Q, K, V, false);
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
+                            Qcur,
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
+                            Kcur,
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_cpy(ctx0,
+                        wsp_ggml_permute(ctx0,
+                            wsp_ggml_reshape_3d(ctx0,
+                                Vcur,
+                                n_state/n_head, n_head, n_ctx),
+                            1, 2, 0, 3),
+                        wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
+            struct wsp_ggml_tensor * KQV = wsp_ggml_flash_attn(ctx0, Q, K, V, false);
 #else
-                struct wsp_ggml_tensor * Q =
-                    wsp_ggml_permute(ctx0,
-                            wsp_ggml_cpy(ctx0,
-                                Qcur,
-                                wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-                struct wsp_ggml_tensor * K =
-                    wsp_ggml_permute(ctx0,
-                            wsp_ggml_cpy(ctx0,
-                                Kcur,
-                                wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                            0, 2, 1, 3);
-                // K * Q
-                struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
-                struct wsp_ggml_tensor * KQ_scaled =
-                    wsp_ggml_scale_inplace(ctx0,
-                            KQ,
-                            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-                            );
-                struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_scaled);
-                struct wsp_ggml_tensor * V =
-                    wsp_ggml_cpy(ctx0,
-                            wsp_ggml_permute(ctx0,
-                                wsp_ggml_reshape_3d(ctx0,
-                                    Vcur,
-                                    n_state/n_head, n_head, n_ctx),
-                                1, 2, 0, 3),
-                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
-                            );
-                struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
+                            Qcur,
+                            wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
+                            Kcur,
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                        0, 2, 1, 3);
+            // K * Q
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ_scaled = wsp_ggml_scale(ctx0, KQ, KQscale);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max(ctx0, KQ_scaled);
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_cpy(ctx0,
+                        wsp_ggml_permute(ctx0,
+                            wsp_ggml_reshape_3d(ctx0,
+                                Vcur,
+                                n_state/n_head, n_head, n_ctx),
+                            1, 2, 0, 3),
+                        wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
+                        );
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
-                struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-                wstate.use_buf(ctx0, 1);
+            cur = wsp_ggml_cpy(ctx0,
+                    KQV_merged,
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx));
+        }
-                cur = wsp_ggml_cpy(ctx0,
-                        KQV_merged,
-                        wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx));
-            }
+        // projection
+        {
+            cur = wsp_ggml_mul_mat(ctx0,
+                    layer.attn_ln_1_w,
+                    cur);
-            // projection
-            {
-                wstate.use_buf(ctx0, 0);
+            cur = wsp_ggml_add(ctx0, cur, layer.attn_ln_1_b);
+        }
-                cur = wsp_ggml_mul_mat(ctx0,
-                        layer.attn_ln_1_w,
-                        cur);
+        // add the input
+        cur = wsp_ggml_add(ctx0, cur, inpL);
-                wstate.use_buf(ctx0, 1);
+        struct wsp_ggml_tensor * inpFF = cur;
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = wsp_ggml_norm(ctx0, inpFF, hparams.eps);
+                // cur = mlp_ln_w*cur + mlp_ln_b
                 cur = wsp_ggml_add(ctx0,
-                        wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
-                        cur);
+                        wsp_ggml_mul(ctx0, cur, layer.mlp_ln_w),
+                        layer.mlp_ln_b);
             }
-            wstate.use_buf(ctx0, 2);
+#ifdef WHISPER_USE_FLASH_FF
+            cur = wsp_ggml_flash_ff(ctx0,
+                    wsp_ggml_cpy(ctx0, cur, wsp_ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
+                    layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
+#else
+            // fully connected
+            cur = wsp_ggml_mul_mat(ctx0,
+                    layer.mlp_0_w,
+                    cur);
-            // add the input
-            cur = wsp_ggml_add(ctx0, cur, inpL);
+            cur = wsp_ggml_add(ctx0, cur, layer.mlp_0_b);
-            struct wsp_ggml_tensor * inpFF = cur;
+            // GELU activation
+            cur = wsp_ggml_gelu(ctx0, cur);
-            // feed-forward network
-            {
-                // norm
-                {
-                    wstate.use_buf(ctx0, 0);
+            // projection
+            cur = wsp_ggml_mul_mat(ctx0,
+                    layer.mlp_1_w,
+                    cur);
-                    cur = wsp_ggml_norm(ctx0, inpFF);
+            cur = wsp_ggml_add(ctx0, cur, layer.mlp_1_b);
+#endif
+        }
-                    wstate.use_buf(ctx0, 1);
+        inpL = wsp_ggml_add(ctx0, cur, inpFF);
+    }
-                    // cur = mlp_ln_w*cur + mlp_ln_b
-                    cur = wsp_ggml_add(ctx0,
-                            wsp_ggml_mul(ctx0,
-                                wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
-                                cur),
-                            wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
-                }
+    cur = inpL;
-#ifdef WHISPER_USE_FLASH_FF
-                wstate.use_buf(ctx0, 0);
+    // norm
+    {
+        cur = wsp_ggml_norm(ctx0, cur, hparams.eps);
-                cur = wsp_ggml_flash_ff(ctx0,
-                        wsp_ggml_cpy(ctx0, cur, wsp_ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
-                        layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
-#else
-                wstate.use_buf(ctx0, 0);
+        // cur = ln_f_g*cur + ln_f_b
+        cur = wsp_ggml_add(ctx0,
+                wsp_ggml_mul(ctx0, cur, model.e_ln_w),
+                model.e_ln_b);
+    }
-                // fully connected
-                cur = wsp_ggml_mul_mat(ctx0,
-                        layer.mlp_0_w,
-                        cur);
+    wsp_ggml_build_forward_expand(gf, cur);
-                wstate.use_buf(ctx0, 1);
+    wstate.embd_enc = cur;
-                cur = wsp_ggml_add(ctx0,
-                        wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
-                        cur);
+    //wsp_ggml_graph_print(gf);
-                wstate.use_buf(ctx0, 0);
+    ////////////////////////////////////////////////////////////////////////////
-                // GELU activation
-                cur = wsp_ggml_gelu(ctx0, cur);
+    //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
+    //        wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
+    //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
+    //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
+    //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
+    //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
-                wstate.use_buf(ctx0, 1);
+    wsp_ggml_free(ctx0);
-                // projection
-                cur = wsp_ggml_mul_mat(ctx0,
-                        layer.mlp_1_w,
-                        cur);
+    return gf;
+}
-                wstate.use_buf(ctx0, 0);
+// pre-compute cross-attention memory
+static struct wsp_ggml_cgraph * whisper_build_graph_cross(
+        whisper_context & wctx,
+          whisper_state & wstate) {
+    const auto & model   = wctx.model;
+    const auto & hparams = model.hparams;
-                cur = wsp_ggml_add(ctx0,
-                        wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
-                        cur);
-#endif
-            }
+    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
+    const int n_state = hparams.n_audio_state;
+    const int n_head  = hparams.n_audio_head;
-            wstate.use_buf(ctx0, 3);
+    struct wsp_ggml_init_params params = {
+        /*.mem_size   =*/ wstate.alloc_cross.meta.size(),
+        /*.mem_buffer =*/ wstate.alloc_cross.meta.data(),
+        /*.no_alloc   =*/ true,
+    };
-            inpL = wsp_ggml_add(ctx0, cur, inpFF);
-        }
+    struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
-        cur = inpL;
+    wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx0);
-        // norm
-        {
-            wstate.use_buf(ctx0, 0);
+    wsp_ggml_allocr * alloc = wstate.alloc_cross.alloc;
-            cur = wsp_ggml_norm(ctx0, cur);
+    struct wsp_ggml_tensor * cur = wsp_ggml_view_tensor(ctx0, wstate.embd_enc);
-            wstate.use_buf(ctx0, 1);
+    struct wsp_ggml_tensor * Kscale = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_F32, 1);
+    wsp_ggml_allocr_alloc(alloc, Kscale);
-            // cur = ln_f_g*cur + ln_f_b
-            cur = wsp_ggml_add(ctx0,
-                    wsp_ggml_mul(ctx0,
-                        wsp_ggml_repeat(ctx0, model.e_ln_w, cur),
-                        cur),
-                    wsp_ggml_repeat(ctx0, model.e_ln_b, cur));
-        }
+    if (!wsp_ggml_allocr_is_measure(alloc)) {
+        wsp_ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25));
+    }
-        wstate.use_buf(ctx0, -1);
+    for (int il = 0; il < model.hparams.n_text_layer; ++il) {
+        auto & layer = model.layers_decoder[il];
-        // run the computation
-        {
-            struct wsp_ggml_cgraph gf = {};
-            gf.n_threads = n_threads;
+        struct wsp_ggml_tensor* Kcross = wsp_ggml_mul_mat(ctx0,
+                layer.cross_attn_k_w,
+                cur);
-            wsp_ggml_build_forward_expand(&gf, cur);
-            wsp_ggml_graph_compute(ctx0, &gf);
+        Kcross = wsp_ggml_scale(ctx0, Kcross, Kscale);
-            //wsp_ggml_graph_print(&gf);
-        }
-    }
-#ifdef WHISPER_USE_COREML
-    else if (use_coreml) {
-        wstate.use_buf(ctx0, -1);
+        struct wsp_ggml_tensor* Vcross = wsp_ggml_mul_mat(ctx0,
+                layer.cross_attn_v_w,
+                cur);
-        cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
+        Vcross = wsp_ggml_add(ctx0,
+                    Vcross,
+                    layer.cross_attn_v_b);
-        whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
-    }
-#endif
-#ifdef WHISPER_USE_OPENVINO
-    else if (use_openvino) {
-        wstate.use_buf(ctx0, -1);
+        Vcross = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
-        cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
+        struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, wstate.kv_cross.k,
+                n_state*n_ctx,
+                (wsp_ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-        if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
-            return false;
-        }
+        struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
+                (   n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v),
+                (il*n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
+        wsp_ggml_build_forward_expand(gf, wsp_ggml_cpy(ctx0, Kcross, k));
+        wsp_ggml_build_forward_expand(gf, wsp_ggml_cpy(ctx0, Vcross, v));
     }
-#endif
-    // cur
-    //{
-    //    printf("ne0 = %d\n", cur->ne[0]);
-    //    printf("ne1 = %d\n", cur->ne[1]);
-    //    for (int i = 0; i < 10; ++i) {
-    //        printf("%8.4f ", ((float *)(cur->data))[i]);
-    //    }
-    //    printf("... ");
-    //    for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) {
-    //        printf("%8.4f ", ((float *)(cur->data))[i]);
-    //    }
-    //    printf("\n");
-    //}
+    //wsp_ggml_graph_print(gf);
-    // pre-compute cross-attention memory
-    {
-        struct wsp_ggml_cgraph gf = {};
-        gf.n_threads = n_threads;
+    wsp_ggml_free(ctx0);
-        // TODO: hack to disconnect the encoded features from the previous graph
-        cur->op = WSP_GGML_OP_NONE;
-        cur->src0 = nullptr;
-        cur->src1 = nullptr;
+    return gf;
+}
-        for (int il = 0; il < model.hparams.n_text_layer; ++il) {
-            auto& layer = model.layers_decoder[il];
+// evaluate the encoder with the given state
+//
+// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder
+// part of the transformer model and returns the encoded features
+//
+//   - wctx:      the model
+//   - wstate:     the state of the encoder
+//   - n_threads:  number of threads to use
+//   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
+//
+static bool whisper_encode_internal(
+        whisper_context & wctx,
+          whisper_state & wstate,
+              const int   mel_offset,
+              const int   n_threads,
+ whisper_abort_callback   abort_callback,
+                   void * abort_callback_data) {
+    const int64_t t_start_us = wsp_ggml_time_us();
-            wstate.use_buf(ctx0, 0);
+    // conv
+    {
+        auto & alloc = wstate.alloc_conv.alloc;
-            struct wsp_ggml_tensor* Kcross = wsp_ggml_mul_mat(ctx0,
-                layer.cross_attn_k_w,
-                cur);
+        wsp_ggml_allocr_reset(alloc);
-            Kcross = wsp_ggml_scale_inplace(ctx0, Kcross, wsp_ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+        wsp_ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
-            wstate.use_buf(ctx0, 1);
+        wsp_ggml_allocr_alloc_graph(alloc, gf);
-            struct wsp_ggml_tensor* Vcross = wsp_ggml_mul_mat(ctx0,
-                layer.cross_attn_v_w,
-                cur);
+        if (!whisper_encode_external(wstate)) {
+            wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
+        }
+    }
-            Vcross = wsp_ggml_add(ctx0,
-                wsp_ggml_repeat(ctx0,
-                    layer.cross_attn_v_b,
-                    Vcross),
-                Vcross);
+    // encoder
+    if (!whisper_encode_external(wstate)) {
+        auto & alloc = wstate.alloc_encode.alloc;
-            wstate.use_buf(ctx0, -1);
+        wsp_ggml_allocr_reset(alloc);
-            Vcross = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
+        wsp_ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
-            struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (wsp_ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-            struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
-                    (   n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v),
-                    (il*n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
+        wsp_ggml_allocr_alloc_graph(alloc, gf);
-            wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcross, k));
-            wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcross, v));
+#ifdef WSP_GGML_USE_METAL
+        if (wstate.ctx_metal) {
+            wsp_ggml_metal_set_n_cb     (wstate.ctx_metal, n_threads);
+            wsp_ggml_metal_graph_compute(wstate.ctx_metal, gf);
+        } else {
+            wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
         }
-        wsp_ggml_graph_compute(ctx0, &gf);
-        //wsp_ggml_graph_print(&gf);
+#else
+        wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
+#endif
     }
-    ////////////////////////////////////////////////////////////////////////////
+    // cross
+    {
+        auto & alloc = wstate.alloc_cross.alloc;
-    //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-    //        wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
+        wsp_ggml_allocr_reset(alloc);
-    wsp_ggml_free(ctx0);
+        wsp_ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
+        wsp_ggml_allocr_alloc_graph(alloc, gf);
+#ifdef WSP_GGML_USE_METAL
+        if (wstate.ctx_metal) {
+            wsp_ggml_metal_set_n_cb     (wstate.ctx_metal, n_threads);
+            wsp_ggml_metal_graph_compute(wstate.ctx_metal, gf);
+        } else {
+            wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
+        }
+#else
+        wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
+#endif
+    }
+    // wsp_ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
     wstate.t_encode_us += wsp_ggml_time_us() - t_start_us;
     wstate.n_encode++;
@@ -1908,26 +2002,13 @@ static bool whisper_encode_internal(
     return true;
 }
-// evaluate the decoder
-//
-// given text prompt + audio features -> computes the logits for the next token
-//
-//   - model:      the model
-//   - n_threads:  number of threads to use
-//   - tokens:     text prompt
-//   - n_tokens:   number of tokens in the prompt
-//   - n_past:     number of past tokens to prefix the prompt with
-//
-static bool whisper_decode_internal(
-        whisper_context & wctx,
-          whisper_state & wstate,
-        whisper_decoder & decoder,
-    const whisper_token * tokens,
-              const int   n_tokens,
-              const int   n_past,
-              const int   n_threads) {
-    const int64_t t_start_us = wsp_ggml_time_us();
+static struct wsp_ggml_cgraph * whisper_build_graph_decoder(
+         whisper_context & wctx,
+         whisper_state   & wstate,
+         whisper_decoder & decoder,
+     const whisper_token * tokens,
+                   int   n_tokens,
+                   int   n_past) {
     const auto & model   = wctx.model;
     const auto & hparams = model.hparams;
@@ -1935,10 +2016,6 @@ static bool whisper_decode_internal(
     WHISPER_ASSERT(!!kv_self.ctx);
-    auto & logits_out = wstate.logits;
-    const int n_vocab = hparams.n_vocab;
     const int n_ctx   = hparams.n_text_ctx;
     const int n_state = hparams.n_text_state;
     const int n_head  = hparams.n_text_head;
@@ -1950,25 +2027,39 @@ static bool whisper_decode_internal(
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
     struct wsp_ggml_init_params params = {
-        /*.mem_size   =*/ wstate.buf_compute.size(),
-        /*.mem_buffer =*/ wstate.buf_compute.data(),
-        /*.no_alloc   =*/ false,
+        /*.mem_size   =*/ wstate.alloc_decode.meta.size(),
+        /*.mem_buffer =*/ wstate.alloc_decode.meta.data(),
+        /*.no_alloc   =*/ true,
     };
     struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
-    struct wsp_ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
+    wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx0);
+    wsp_ggml_allocr * alloc = wstate.alloc_decode.alloc;
     struct wsp_ggml_tensor * embd = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
-    memcpy(embd->data, tokens, N*wsp_ggml_element_size(embd));
+    wsp_ggml_allocr_alloc(alloc, embd);
+    if (!wsp_ggml_allocr_is_measure(alloc)) {
+        memcpy(embd->data, tokens, N*wsp_ggml_element_size(embd));
+    }
     struct wsp_ggml_tensor * position = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
+    wsp_ggml_allocr_alloc(alloc, position);
+    if (!wsp_ggml_allocr_is_measure(alloc)) {
+        for (int i = 0; i < N; ++i) {
+            ((int32_t *) position->data)[i] = n_past + i;
+        }
     }
-    wstate.use_buf(ctx0, 3);
+    struct wsp_ggml_tensor * KQscale = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_F32, 1);
+    wsp_ggml_allocr_alloc(alloc, KQscale);
+    if (!wsp_ggml_allocr_is_measure(alloc)) {
+        wsp_ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25));
+    }
     // token encoding + position encoding
     struct wsp_ggml_tensor * cur =
@@ -1983,16 +2074,14 @@ static bool whisper_decode_internal(
         // norm
         {
-            wstate.use_buf(ctx0, 0);
-            cur = wsp_ggml_norm(ctx0, inpL);
+            cur = wsp_ggml_norm(ctx0, inpL, hparams.eps);
             // cur = ln_0_w*cur + ln_0_b
             cur = wsp_ggml_add(ctx0,
                     wsp_ggml_mul(ctx0,
-                        wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
-                        cur),
-                    wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+                        cur,
+                        layer.attn_ln_0_w),
+                    layer.attn_ln_0_b);
         }
         // self-attention
@@ -2002,19 +2091,17 @@ static bool whisper_decode_internal(
                     cur);
             Qcur = wsp_ggml_add(ctx0,
-                    wsp_ggml_repeat(ctx0,
-                        layer.attn_q_b,
-                        Qcur),
-                    Qcur);
+                        Qcur,
+                        layer.attn_q_b);
-            Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = wsp_ggml_scale(ctx0, Qcur, KQscale);
             // note: no bias for Key
             struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
-            Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = wsp_ggml_scale(ctx0, Kcur, KQscale);
             // store key and value to memory
             {
@@ -2023,10 +2110,8 @@ static bool whisper_decode_internal(
                         cur);
                 Vcur = wsp_ggml_add(ctx0,
-                        wsp_ggml_repeat(ctx0,
-                            layer.attn_v_b,
-                            Vcur),
-                        Vcur);
+                            Vcur,
+                            layer.attn_v_b);
                 Vcur = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcur, n_state, N));
@@ -2035,42 +2120,32 @@ static bool whisper_decode_internal(
                         (   n_ctx)*wsp_ggml_element_size(kv_self.v),
                         (il*n_ctx)*wsp_ggml_element_size(kv_self.v)*n_state + n_past*wsp_ggml_element_size(kv_self.v));
-                wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcur, k));
-                wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcur, v));
+                wsp_ggml_build_forward_expand(gf, wsp_ggml_cpy(ctx0, Kcur, k));
+                wsp_ggml_build_forward_expand(gf, wsp_ggml_cpy(ctx0, Vcur, v));
             }
             // ------
-            wstate.use_buf(ctx0, 0);
             struct wsp_ggml_tensor * Q =
                 wsp_ggml_permute(ctx0,
-                        wsp_ggml_cpy(ctx0,
-                            Qcur,
-                            wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
+                        wsp_ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, N),
                         0, 2, 1, 3);
             struct wsp_ggml_tensor * K =
-                wsp_ggml_permute(ctx0,
-                        wsp_ggml_reshape_3d(ctx0,
-                            wsp_ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*wsp_ggml_element_size(kv_self.k)*n_state),
-                            n_state/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-            wstate.use_buf(ctx0, 1);
+                wsp_ggml_view_3d(ctx0, kv_self.k,
+                        n_state/n_head, n_past + N, n_head,
+                        wsp_ggml_element_size(kv_self.k)*n_state,
+                        wsp_ggml_element_size(kv_self.k)*n_state/n_head,
+                        wsp_ggml_element_size(kv_self.k)*n_state*n_ctx*il);
             // K * Q
             struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
-            //struct wsp_ggml_tensor * KQ_scaled =
-            //    wsp_ggml_scale_inplace(ctx0,
-            //            KQ,
-            //            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-            //            );
+            //struct wsp_ggml_tensor * KQ_scaled = wsp_ggml_scale(ctx0, KQ, KQ_scale);
-            struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
+            struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf(ctx0, KQ, n_past);
-            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max(ctx0, KQ_masked);
             struct wsp_ggml_tensor * V =
                 wsp_ggml_view_3d(ctx0, kv_self.v,
@@ -2090,36 +2165,28 @@ static bool whisper_decode_internal(
         // projection
         {
-            wstate.use_buf(ctx0, 0);
             cur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);
-            wstate.use_buf(ctx0, 1);
             cur = wsp_ggml_add(ctx0,
-                    wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
-                    cur);
+                    cur,
+                    layer.attn_ln_1_b);
         }
-        wstate.use_buf(ctx0, 2);
         // add the input
         struct wsp_ggml_tensor * inpCA = wsp_ggml_add(ctx0, cur, inpL);
         // norm
         {
-            wstate.use_buf(ctx0, 0);
-            cur = wsp_ggml_norm(ctx0, inpCA); // note: we use inpCA here
+            cur = wsp_ggml_norm(ctx0, inpCA, hparams.eps); // note: we use inpCA here
             // cur = ln_0_w*cur + ln_0_b
             cur = wsp_ggml_add(ctx0,
                     wsp_ggml_mul(ctx0,
-                        wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
-                        cur),
-                    wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
+                        cur,
+                        layer.cross_attn_ln_0_w),
+                    layer.cross_attn_ln_0_b);
         }
         // cross-attention
@@ -2129,18 +2196,18 @@ static bool whisper_decode_internal(
                     cur);
             Qcur = wsp_ggml_add(ctx0,
-                    wsp_ggml_repeat(ctx0,
-                        layer.cross_attn_q_b,
-                        Qcur),
-                    Qcur);
+                        Qcur,
+                        layer.cross_attn_q_b);
-            Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = wsp_ggml_scale(ctx0, Qcur, KQscale);
             // Kcross is already scaled
             struct wsp_ggml_tensor * Kcross =
-                wsp_ggml_reshape_3d(ctx0,
-                        wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.k)*n_state),
-                        n_state/n_head, n_head, M);
+                wsp_ggml_view_3d(ctx0, wstate.kv_cross.k,
+                        n_state/n_head, M, n_head,
+                        wsp_ggml_element_size(wstate.kv_cross.k)*n_state,
+                        wsp_ggml_element_size(wstate.kv_cross.k)*n_state/n_head,
+                        wsp_ggml_element_size(wstate.kv_cross.k)*n_state*M*il);
             //struct wsp_ggml_tensor * Vcross =
             //    wsp_ggml_reshape_3d(ctx0,
@@ -2163,26 +2230,22 @@ static bool whisper_decode_internal(
             struct wsp_ggml_tensor * Q =
                 wsp_ggml_permute(ctx0,
-                        wsp_ggml_cpy(ctx0,
-                            Qcur,
-                            wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
+                        wsp_ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, N),
                         0, 2, 1, 3);
-            struct wsp_ggml_tensor * K = wsp_ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
             // K * Q
-            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, Kcross, Q);
             //struct wsp_ggml_tensor * KQ_scaled =
-            //    wsp_ggml_scale_inplace(ctx0,
+            //    wsp_ggml_scale(ctx0,
             //            KQ,
             //            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
             // no masking for cross-attention
-            //struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            //struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max(ctx0, KQ);
             struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
@@ -2196,21 +2259,15 @@ static bool whisper_decode_internal(
         // projection
         {
-            wstate.use_buf(ctx0, 0);
             cur = wsp_ggml_mul_mat(ctx0,
                     layer.cross_attn_ln_1_w,
                     cur);
-            wstate.use_buf(ctx0, 1);
             cur = wsp_ggml_add(ctx0,
-                    wsp_ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
-                    cur);
+                    cur,
+                    layer.cross_attn_ln_1_b);
         }
-        wstate.use_buf(ctx0, 2);
         // add the input
         cur = wsp_ggml_add(ctx0, cur, inpCA);
@@ -2220,54 +2277,38 @@ static bool whisper_decode_internal(
         {
             // norm
             {
-                wstate.use_buf(ctx0, 0);
-                cur = wsp_ggml_norm(ctx0, inpFF);
-                wstate.use_buf(ctx0, 1);
+                cur = wsp_ggml_norm(ctx0, inpFF, hparams.eps);
                 // cur = mlp_ln_w*cur + mlp_ln_b
                 cur = wsp_ggml_add(ctx0,
                         wsp_ggml_mul(ctx0,
-                            wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
-                            cur),
-                        wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
+                            cur,
+                            layer.mlp_ln_w),
+                        layer.mlp_ln_b);
             }
-            wstate.use_buf(ctx0, 0);
             // fully connected
             cur = wsp_ggml_mul_mat(ctx0,
                     layer.mlp_0_w,
                     cur);
-            wstate.use_buf(ctx0, 1);
             cur = wsp_ggml_add(ctx0,
-                    wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
-                    cur);
-            wstate.use_buf(ctx0, 0);
+                    cur,
+                    layer.mlp_0_b);
             // GELU activation
             cur = wsp_ggml_gelu(ctx0, cur);
-            wstate.use_buf(ctx0, 1);
             // projection
             cur = wsp_ggml_mul_mat(ctx0,
                     layer.mlp_1_w,
                     cur);
-            wstate.use_buf(ctx0, 0);
             cur = wsp_ggml_add(ctx0,
-                    wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
-                    cur);
+                    cur,
+                    layer.mlp_1_b);
         }
-        wstate.use_buf(ctx0, 3);
         inpL = wsp_ggml_add(ctx0, cur, inpFF);
     }
@@ -2275,21 +2316,15 @@ static bool whisper_decode_internal(
     // norm
     {
-        wstate.use_buf(ctx0, 0);
-        cur = wsp_ggml_norm(ctx0, cur);
-        wstate.use_buf(ctx0, 1);
+        cur = wsp_ggml_norm(ctx0, cur, hparams.eps);
         cur = wsp_ggml_add(ctx0,
                 wsp_ggml_mul(ctx0,
-                    wsp_ggml_repeat(ctx0, model.d_ln_w, cur),
-                    cur),
-                wsp_ggml_repeat(ctx0, model.d_ln_b, cur));
+                    cur,
+                    model.d_ln_w),
+                model.d_ln_b);
     }
-    wstate.use_buf(ctx0, 0);
     // compute logits only for the last token
     // comment this line to compute logits for all N tokens
     // might be useful in the future
@@ -2297,23 +2332,77 @@ static bool whisper_decode_internal(
     struct wsp_ggml_tensor * logits = wsp_ggml_mul_mat(ctx0, model.d_te, cur);
-    wstate.use_buf(ctx0, -1);
+    wsp_ggml_build_forward_expand(gf, logits);
+    wsp_ggml_free(ctx0);
+    return gf;
+}
+// evaluate the decoder
+//
+// given text prompt + audio features -> computes the logits for the next token
+//
+//   - model:      the model
+//   - n_threads:  number of threads to use
+//   - tokens:     text prompt
+//   - n_tokens:   number of tokens in the prompt
+//   - n_past:     number of past tokens to prefix the prompt with
+//
+static bool whisper_decode_internal(
+        whisper_context & wctx,
+          whisper_state & wstate,
+        whisper_decoder & decoder,
+    const whisper_token * tokens,
+              const int   n_tokens,
+              const int   n_past,
+              const int   n_threads,
+ whisper_abort_callback   abort_callback,
+                   void * abort_callback_data) {
+    const int64_t t_start_us = wsp_ggml_time_us();
+    const auto & model   = wctx.model;
+    const auto & hparams = model.hparams;
+    const int n_vocab = hparams.n_vocab;
+    auto & logits_out = wstate.logits;
+    struct wsp_ggml_tensor * logits;
-    // run the computation
+    // decoder
     {
-        wsp_ggml_build_forward_expand(&gf, logits);
-        wsp_ggml_graph_compute       (ctx0, &gf);
+        auto & alloc = wstate.alloc_decode.alloc;
+        wsp_ggml_allocr_reset(alloc);
+        wsp_ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, decoder, tokens, n_tokens, n_past);
+        wsp_ggml_allocr_alloc_graph(alloc, gf);
+        logits = gf->nodes[gf->n_nodes - 1];
+#ifdef WSP_GGML_USE_METAL
+        if (wstate.ctx_metal) {
+            wsp_ggml_metal_set_n_cb     (wstate.ctx_metal, n_threads);
+            wsp_ggml_metal_graph_compute(wstate.ctx_metal, gf);
+        } else {
+            wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
+        }
+#else
+        wsp_ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads, abort_callback, abort_callback_data);
+#endif
     }
     // extract logits for all N tokens
-    //logits_out.resize(N*n_vocab);
-    //memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*N*n_vocab);
+    //logits_out.resize(n_tokens*n_vocab);
+    //memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*n_tokens*n_vocab);
     // extract logits only for the last token
     logits_out.resize(n_vocab);
     memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*n_vocab);
-    if (N > 1) {
+    if (n_tokens > 1) {
         //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
         //        wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
@@ -2322,14 +2411,18 @@ static bool whisper_decode_internal(
         //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
     }
-    wsp_ggml_free(ctx0);
-    wstate.t_decode_us += wsp_ggml_time_us() - t_start_us;
-    wstate.n_decode++;
+    if (n_tokens == 1) {
+        wstate.t_decode_us += wsp_ggml_time_us() - t_start_us;
+        wstate.n_decode++;
+    } else {
+        wstate.t_prompt_us += wsp_ggml_time_us() - t_start_us;
+        wstate.n_prompt++;
+    }
     return true;
 }
 //  500 -> 00:05.000
 // 6000 -> 01:00.000
 static std::string to_timestamp(int64_t t, bool comma = false) {
@@ -2351,7 +2444,7 @@ static std::string to_timestamp(int64_t t, bool comma = false) {
 static float sin_vals[SIN_COS_N_COUNT];
 static float cos_vals[SIN_COS_N_COUNT];
-// In FFT, we frequently use sine and cosine operations with the same values.
+// In FFT, we frequently use sine and cosine operations with the same values.
 // We can use precalculated values to speed up the process.
 static void fill_sin_cos_table() {
     static bool is_filled = false;
@@ -2446,7 +2539,7 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
 }
 static bool hann_window(int length, bool periodic, std::vector<float> & output) {
-    if (output.size() < length) {
+    if (output.size() < static_cast<size_t>(length)) {
         output.resize(length);
     }
     int offset = -1;
@@ -2738,9 +2831,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     fill_sin_cos_table();
     whisper_state * state = new whisper_state;
-    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
-    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
+    if (!kv_cache_init(ctx->model.hparams, state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
         log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
         delete state;
         return nullptr;
@@ -2751,7 +2842,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
         log("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
-    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
+    if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
         log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
         delete state;
         return nullptr;
@@ -2772,6 +2863,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     if (!state->ctx_coreml) {
         log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
 #ifndef WHISPER_COREML_ALLOW_FALLBACK
+        delete state;
         return nullptr;
 #endif
     } else {
@@ -2786,15 +2878,111 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     // TAGS: WHISPER_DECODER_INIT
     state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx);
-    state->decoders[0].probs.reserve(ctx->vocab.n_vocab);
-    state->decoders[0].logits.reserve(ctx->vocab.n_vocab);
+    state->decoders[0].probs.reserve   (ctx->vocab.n_vocab);
+    state->decoders[0].logits.reserve  (ctx->vocab.n_vocab);
     state->decoders[0].logprobs.reserve(ctx->vocab.n_vocab);
-    state->buf_compute.resize(scale * std::max(MEM_REQ_ENCODE.at(ctx->model.type), MEM_REQ_DECODE.at(ctx->model.type)));
-    state->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
-    state->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
-    state->buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(ctx->model.type));
-    state->buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(ctx->model.type));
+    // conv allocator
+    {
+        whisper_allocr_graph_init(state->alloc_conv,
+                [&]() {
+                    return whisper_build_graph_conv(*ctx, *state, 0);
+                });
+        log("%s: compute buffer (conv)   = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1024.0 / 1024.0);
+    }
+    // encoder allocator
+    if (!whisper_encode_external(*state)) {
+        whisper_allocr_graph_init(state->alloc_encode,
+                [&]() {
+                    return whisper_build_graph_encoder(*ctx, *state);
+                });
+        log("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1024.0 / 1024.0);
+    }
+    // cross allocator
+    {
+        whisper_allocr_graph_init(state->alloc_cross,
+                [&]() {
+                    return whisper_build_graph_cross(*ctx, *state);
+                });
+        log("%s: compute buffer (cross)  = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1024.0 / 1024.0);
+    }
+    // decoder allocator
+    {
+        whisper_allocr_graph_init(state->alloc_decode,
+                [&]() {
+                    const auto & hparams = ctx->model.hparams;
+                    // TODO: make sure this is the worst-case scenario
+                    const int n_tokens = hparams.n_text_ctx;
+                    const int n_past   = 0;
+                    return whisper_build_graph_decoder(*ctx, *state, state->decoders[0], nullptr, n_tokens, n_past);
+                });
+        log("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1024.0 / 1024.0);
+    }
+#ifdef WSP_GGML_USE_METAL
+    state->ctx_metal = wsp_ggml_metal_init(1);
+    if (!state->ctx_metal) {
+        log("%s: wsp_ggml_metal_init() failed\n", __func__);
+        delete state;
+        return nullptr;
+    }
+    log("%s: Metal context initialized\n", __func__);
+    // this allocates all Metal resources and memory buffers
+    void * data_ptr  = NULL;
+    size_t data_size = 0;
+    // TODO: add mmap support
+    //if (params.use_mmap) {
+    //    data_ptr  = ctx->model.mapping->addr;
+    //    data_size = ctx->model.mapping->size;
+    //} else {
+    //    data_ptr  = wsp_ggml_get_mem_buffer(ctx->model.ctx);
+    //    data_size = wsp_ggml_get_mem_size  (ctx->model.ctx);
+    //}
+    data_ptr  = wsp_ggml_get_mem_buffer(ctx->model.ctx);
+    data_size = wsp_ggml_get_mem_size  (ctx->model.ctx);
+    const size_t max_size = wsp_ggml_get_max_tensor_size(ctx->model.ctx);
+    log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+#define WHISPER_METAL_CHECK_BUF(result)              \
+    if (!(result)) {                                 \
+        log("%s: failed to add metal buffer\n", __func__); \
+        delete state;                                \
+        return nullptr;                              \
+    }
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "meta_conv",   state->alloc_conv.meta.data(),   state->alloc_conv.meta.size(),   0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "meta_cross",  state->alloc_cross.meta.data(),  state->alloc_cross.meta.size(),  0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "data_conv",   state->alloc_conv.data.data(),   state->alloc_conv.data.size(),   0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "data_cross",  state->alloc_cross.data.data(),  state->alloc_cross.data.size(),  0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "kv_cross",  state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0));
+    WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, "kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0));
+#undef WHISPER_METAL_CHECK_BUF
+#endif
     state->rng = std::mt19937(0);
@@ -2851,7 +3039,6 @@ int whisper_ctx_init_openvino_encoder(
 }
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
     log("%s: loading model from '%s'\n", __func__, path_model);
     auto fin = std::ifstream(path_model, std::ios::binary);
@@ -3004,6 +3191,13 @@ void whisper_free_state(struct whisper_state * state)
         }
 #endif
+#ifdef WSP_GGML_USE_METAL
+        if (state->ctx_metal) {
+            wsp_ggml_metal_free(state->ctx_metal);
+            state->ctx_metal = nullptr;
+        }
+#endif
 #ifdef WHISPER_USE_OPENVINO
         if (state->ctx_openvino != nullptr) {
             whisper_openvino_free(state->ctx_openvino);
@@ -3011,6 +3205,11 @@ void whisper_free_state(struct whisper_state * state)
         }
 #endif
+        whisper_allocr_free(state->alloc_conv);
+        whisper_allocr_free(state->alloc_decode);
+        whisper_allocr_free(state->alloc_cross);
+        whisper_allocr_free(state->alloc_encode);
         delete state;
     }
 }
@@ -3103,7 +3302,7 @@ int whisper_set_mel(
 }
 int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
-    if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
+    if (!whisper_encode_internal(*ctx, *state, offset, n_threads, nullptr, nullptr)) {
         log("%s: failed to eval\n", __func__);
         return -1;
     }
@@ -3112,7 +3311,7 @@ int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state
 }
 int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
-    if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
+    if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads, nullptr, nullptr)) {
         log("%s: failed to eval\n", __func__);
         return -1;
     }
@@ -3123,7 +3322,7 @@ int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
 int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
     const int selected_decoder_id = 0;
-    if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
+    if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads, nullptr, nullptr)) {
         log("%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3140,7 +3339,7 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
         return false;
     }
-    if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
+    if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads, nullptr, nullptr)) {
         log("%s: failed to eval\n", __func__);
         return 1;
     }
@@ -3431,12 +3630,14 @@ void whisper_print_timings(struct whisper_context * ctx) {
         const int32_t n_sample = std::max(1, ctx->state->n_sample);
         const int32_t n_encode = std::max(1, ctx->state->n_encode);
         const int32_t n_decode = std::max(1, ctx->state->n_decode);
+        const int32_t n_prompt = std::max(1, ctx->state->n_prompt);
         log("%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
         log("%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
         log("%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
         log("%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
         log("%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
+        log("%s:   prompt time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_prompt_us, n_prompt, 1e-3f * ctx->state->t_prompt_us / n_prompt);
     }
     log("%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
@@ -3446,6 +3647,11 @@ void whisper_reset_timings(struct whisper_context * ctx) {
         ctx->state->t_sample_us = 0;
         ctx->state->t_encode_us = 0;
         ctx->state->t_decode_us = 0;
+        ctx->state->t_prompt_us = 0;
+        ctx->state->n_sample = 0;
+        ctx->state->n_encode = 0;
+        ctx->state->n_decode = 0;
+        ctx->state->n_prompt = 0;
     }
 }
@@ -3475,6 +3681,7 @@ const char * whisper_print_system_info(void) {
     s += "FMA = "       + std::to_string(wsp_ggml_cpu_has_fma())       + " | ";
     s += "NEON = "      + std::to_string(wsp_ggml_cpu_has_neon())      + " | ";
     s += "ARM_FMA = "   + std::to_string(wsp_ggml_cpu_has_arm_fma())   + " | ";
+    s += "METAL = "     + std::to_string(wsp_ggml_cpu_has_metal())     + " | ";
     s += "F16C = "      + std::to_string(wsp_ggml_cpu_has_f16c())      + " | ";
     s += "FP16_VA = "   + std::to_string(wsp_ggml_cpu_has_fp16_va())   + " | ";
     s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
@@ -3566,6 +3773,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.encoder_begin_callback           =*/ nullptr,
         /*.encoder_begin_callback_user_data =*/ nullptr,
+        /*.abort_callback           =*/ nullptr,
+        /*.abort_callback_user_data =*/ nullptr,
         /*.logits_filter_callback           =*/ nullptr,
         /*.logits_filter_callback_user_data =*/ nullptr,
     };
@@ -3970,17 +4180,21 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
     auto & logits_id = state.logits_id;
-    logits_id.clear();
+    logits_id.resize(n_logits);
     for (int i = 0; i < n_logits; ++i) {
-        logits_id.push_back({ logits[i], i });
+        logits_id[i].first = logits[i];
+        logits_id[i].second = i;
     }
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + k, logits_id.end(),
-            [](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
-                return a.first > b.first;
-            });
+    {
+        using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
+        std::partial_sort(
+                logits_id.begin(),
+                logits_id.begin() + k, logits_id.end(),
+                [](const pair_type & a, const pair_type & b) {
+            return a.first > b.first;
+        });
+    }
     std::vector<whisper_token_data> result;
     result.reserve(k);
@@ -4075,6 +4289,115 @@ static void whisper_sequence_score(
     }
 }
+static bool whisper_kv_swap_fast( // rearrange decoder KV caches in place: decoder i ends up with the original cache of decoder view[i]
+                   std::vector<int> & view, // view[i] = source decoder index for decoder i; view[i] == i or view[i] < 0 leaves decoder i untouched
+                    whisper_decoder   src[], // decoders whose kv_self caches are read and overwritten
+                std::vector<kv_buf> & kv_swap_bufs, // scratch storage for cache snapshots; resized to n_decoders below
+                          const int & n_decoders) { // number of entries of view/src that are processed
+    WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders);
+    // (decoder->buffer->decoder or decoder->buffer + decoder->decoder)
+    std::set<int> two_copy; // decoder indices that must be snapshotted first because a later decoder still needs their original KV cache
+    // (buffer->decoder or decoder->decoder)
+    std::set<int> one_copy; // decoder indices that can be overwritten with a single copy
+    // (decoder<->decoder)
+    std::set<int> p_swap_set; // decoder indices that are part of a symmetric pair and are exchanged via std::swap instead of memcpy
+    std::vector<whisper_pair<int, int>> p_swap_vec; // the symmetric pairs themselves, as (i, j) with i < j
+    p_swap_vec.reserve(n_decoders);
+    // see https://github.com/ggerganov/whisper.cpp/wiki
+    for (int i = 0; i < n_decoders; i++) {
+        // zero-copy (no modification): decoder keeps its own cache or has no source assigned
+        if (i == view[i] || view[i] < 0) {
+            continue;
+        }
+        bool is_one_copy = true;
+        // since we modify data sequentially, we only consider decoder indices after the current index
+        for (int j = i + 1; j < n_decoders; j++) {
+            if (i == view[j]) { // a later decoder j reads from decoder i
+                // symmetric pair: i takes j's cache and j takes i's cache
+                if (j == view[i]) {
+                    p_swap_set.insert(i);
+                    p_swap_set.insert(j);
+                    p_swap_vec.emplace_back(i, j);
+                } else {
+                    two_copy.insert(i);
+                    is_one_copy = false;
+                }
+                break; // only the first later reader of i decides the classification
+            }
+        }
+        if (is_one_copy) { // note: members of a symmetric pair also land here; the copy loops skip them via p_swap_set
+            one_copy.insert(i);
+        }
+    }
+    kv_swap_bufs.resize(n_decoders);
+    for (int i = 0; i < n_decoders; i++) { // buffers are sized for every decoder, although only two_copy entries are filled below
+        kv_swap_bufs[i].k.resize(wsp_ggml_nbytes(src[i].kv_self.k));
+        kv_swap_bufs[i].v.resize(wsp_ggml_nbytes(src[i].kv_self.v));
+    }
+    for (auto & i : two_copy) {
+        // snapshot the original KV cache of every two-copy decoder before anything is overwritten
+        WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i);
+        memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size());
+        memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size());
+    }
+    // since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first
+    for (auto & i : two_copy) {
+        // skip the decoder indices that require pointer swapping
+        if (p_swap_set.find(i) != p_swap_set.end()) {
+            continue;
+        }
+        if (two_copy.find(view[i]) != two_copy.end()) {
+            // source is itself a two-copy decoder: read its snapshot from kv_swap_bufs
+            WHISPER_PRINT_DEBUG("%s: two-copy decoder using   swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
+            memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
+        } else {
+            // modify KV caches of decoder using data from the corresponding decoder KV caches directly
+            WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers:      %d  -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, wsp_ggml_nbytes(src[view[i]].kv_self.k));
+            memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, wsp_ggml_nbytes(src[view[i]].kv_self.v));
+        }
+    }
+    // then modify one-copy decoder KV caches
+    for (auto & i : one_copy) {
+        // skip the decoder indices that require pointer swapping
+        if (p_swap_set.find(i) != p_swap_set.end()) {
+            continue;
+        }
+        if (two_copy.find(view[i]) != two_copy.end()) {
+            // source is a two-copy decoder: read its snapshot from kv_swap_bufs
+            WHISPER_PRINT_DEBUG("%s: one-copy decoder using   swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
+            memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
+        } else {
+            // modify KV caches of decoder using data from the corresponding decoder KV caches directly
+            WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers:      %d  -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, wsp_ggml_nbytes(src[view[i]].kv_self.k));
+            memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, wsp_ggml_nbytes(src[view[i]].kv_self.v));
+        }
+    }
+    // finally exchange the caches of the symmetric pairs (done last, after every memcpy above)
+    for (auto & i : p_swap_vec) {
+        WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second);
+        std::swap(src[i.first].kv_self, src[i.second].kv_self);
+    }
+    return true; // no failure path exists in this function
+}
 int whisper_full_with_state(
         struct whisper_context * ctx,
           struct whisper_state * state,
@@ -4182,6 +4505,21 @@ int whisper_full_with_state(
             decoder.probs.resize   (ctx->vocab.n_vocab);
             decoder.logits.resize  (ctx->vocab.n_vocab);
             decoder.logprobs.resize(ctx->vocab.n_vocab);
+            // TODO: not very clean - look for a better way and potentially merging with the init of decoder 0
+#ifdef WSP_GGML_USE_METAL
+#define WHISPER_METAL_CHECK_BUF(result)              \
+            if (!(result)) {                                 \
+                log("%s: failed to add metal buffer\n", __func__); \
+                return 0;                              \
+            }
+            const std::string kv_name = "kv_self_" + std::to_string(j);
+            auto & kv_self = decoder.kv_self;
+            WHISPER_METAL_CHECK_BUF(wsp_ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0));
+#undef WHISPER_METAL_CHECK_BUF
+#endif
         }
     }
@@ -4197,7 +4535,7 @@ int whisper_full_with_state(
         // initial prompt
         if (!params.prompt_tokens && params.initial_prompt) {
-            prompt_tokens.resize(1024);
+            prompt_tokens.resize(2048);
             prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
             params.prompt_tokens   = prompt_tokens.data();
             params.prompt_n_tokens = prompt_tokens.size();
@@ -4238,14 +4576,6 @@ int whisper_full_with_state(
     std::vector<whisper_token> prompt;
     prompt.reserve(whisper_n_text_ctx(ctx));
-    // beam-search helpers
-    struct kv_buf {
-        std::vector<uint8_t> k;
-        std::vector<uint8_t> v;
-    };
-    std::vector<kv_buf> kv_bufs;
     struct beam_candidate {
         int decoder_idx;
         int seek_delta;
@@ -4279,7 +4609,7 @@ int whisper_full_with_state(
         }
         // encode audio features starting at offset seek
-        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
+        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads, params.abort_callback, params.abort_callback_user_data)) {
             log("%s: failed to encode\n", __func__);
             return -6;
         }
@@ -4362,7 +4692,7 @@ int whisper_full_with_state(
                 }
                 WHISPER_PRINT_DEBUG("\n\n");
-                if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
+                if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads, params.abort_callback, params.abort_callback_user_data)) {
                     log("%s: failed to decode\n", __func__);
                     return -7;
                 }
@@ -4382,8 +4712,8 @@ int whisper_full_with_state(
                         decoder.kv_self.n += prompt.size();
-                        memcpy(decoder.probs.data(), state->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
-                        memcpy(decoder.logits.data(), state->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
+                        memcpy(decoder.probs.data(),    state->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
+                        memcpy(decoder.logits.data(),   state->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
                         memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
                     }
@@ -4394,23 +4724,7 @@ int whisper_full_with_state(
             for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
                 const int64_t t_start_sample_us = wsp_ggml_time_us();
-                // store the KV caches of all decoders when doing beam-search
                 if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
-                    kv_bufs.resize(n_decoders_cur);
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-                        kv_bufs[j].k.resize(wsp_ggml_nbytes(decoder.kv_self.k));
-                        kv_bufs[j].v.resize(wsp_ggml_nbytes(decoder.kv_self.v));
-                        memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
-                        memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
-                    }
                     beam_candidates.clear();
                 }
@@ -4458,6 +4772,7 @@ int whisper_full_with_state(
                     });
                     uint32_t cur_c = 0;
+                    std::vector<int> decoder_idx(n_decoders_cur, -1);
                     for (int j = 0; j < n_decoders_cur; ++j) {
                         auto & decoder = state->decoders[j];
@@ -4476,12 +4791,13 @@ int whisper_full_with_state(
                         decoder.seek_delta = cur.seek_delta;
                         decoder.has_ts     = cur.has_ts;
-                        memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
-                        memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
+                        decoder_idx[j] = cur.decoder_idx;
                         WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                                 __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
                     }
+                    // update KV caches
+                    whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur);
                 }
                 // update the decoder state
@@ -4600,7 +4916,7 @@ int whisper_full_with_state(
                     //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
-                    if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
+                    if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads, params.abort_callback, params.abort_callback_user_data)) {
                         log("%s: failed to decode\n", __func__);
                         return -8;
                     }
@@ -4910,6 +5226,12 @@ int whisper_full_parallel(
         ctx->state->t_sample_us += states[i]->t_sample_us;
         ctx->state->t_encode_us += states[i]->t_encode_us;
         ctx->state->t_decode_us += states[i]->t_decode_us;
+        ctx->state->t_prompt_us += states[i]->t_prompt_us;
+        ctx->state->n_sample += states[i]->n_sample;
+        ctx->state->n_encode += states[i]->n_encode;
+        ctx->state->n_decode += states[i]->n_decode;
+        ctx->state->n_prompt += states[i]->n_prompt;
         whisper_free_state(states[i]);
     }
@@ -4963,6 +5285,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
     return ctx->state->result_all[i_segment].t1;
 }
+bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment) { // state-based getter for a segment's speaker_turn_next flag
+    return state->result_all[i_segment].speaker_turn_next; // i_segment is not range-checked here
+}
 bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) { // same getter, reading from the context's own state (ctx->state)
     return ctx->state->result_all[i_segment].speaker_turn_next; // i_segment is not range-checked here
 }
@@ -5106,7 +5432,8 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
     // b: N*N*sizeof(float)
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
+    std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*wsp_ggml_tensor_overhead());
+    std::vector<uint8_t> work;
     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -5158,17 +5485,15 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
             struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(c);
-            gf.n_threads = n_threads;
             double tsum = 0.0;
             // heat-up
-            wsp_ggml_graph_compute(ctx0, &gf);
+            wsp_ggml_graph_compute_helper(work, &gf, n_threads, nullptr , nullptr);
             for (int i = 0; i < n_max; ++i) {
                 const int64_t t0 = wsp_ggml_time_us();
-                wsp_ggml_graph_compute(ctx0, &gf);
+                wsp_ggml_graph_compute_helper(work, &gf, n_threads, nullptr, nullptr);
                 const int64_t t1 = wsp_ggml_time_us();