cui-llama.rn 1.1.2 → 1.1.4

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/cpp/ggml.c CHANGED
@@ -69,23 +69,42 @@ int lm_ggml_sve_cnt_b = 0;
69
69
  #endif
70
70
  #include <windows.h>
71
71
 
72
+ #if !defined(__clang__)
72
73
  typedef volatile LONG atomic_int;
73
74
  typedef atomic_int atomic_bool;
74
75
  typedef atomic_int atomic_flag;
75
76
 
76
77
  #define ATOMIC_FLAG_INIT 0
77
78
 
79
+ typedef enum {
80
+ memory_order_relaxed,
81
+ memory_order_consume,
82
+ memory_order_acquire,
83
+ memory_order_release,
84
+ memory_order_acq_rel,
85
+ memory_order_seq_cst
86
+ } memory_order;
87
+
78
88
  static void atomic_store(atomic_int * ptr, LONG val) {
79
89
  InterlockedExchange(ptr, val);
80
90
  }
91
+ static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
92
+ // TODO: add support for explicit memory order
93
+ InterlockedExchange(ptr, val);
94
+ }
81
95
  static LONG atomic_load(atomic_int * ptr) {
82
96
  return InterlockedCompareExchange(ptr, 0, 0);
83
97
  }
98
+ static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
99
+ // TODO: add support for explicit memory order
100
+ return InterlockedCompareExchange(ptr, 0, 0);
101
+ }
84
102
  static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
85
103
  return InterlockedExchangeAdd(ptr, inc);
86
104
  }
87
- static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
88
- return atomic_fetch_add(ptr, -(dec));
105
+ static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
106
+ // TODO: add support for explicit memory order
107
+ return InterlockedExchangeAdd(ptr, inc);
89
108
  }
90
109
  static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
91
110
  return InterlockedExchange(ptr, 1);
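
The hunks above give the MSVC (non-clang) Windows build a small C11-atomics shim: a memory_order enum plus _explicit variants of the Interlocked-based wrappers (the memory-order argument is currently ignored, per the TODO comments), while clang on Windows keeps using <stdatomic.h>. A minimal sketch of how code elsewhere in the file can use these wrappers; the counter and function names here are illustrative, not from the package:

// Illustrative only: a relaxed progress counter built on the wrappers above.
static atomic_int n_done = 0;

static void mark_done(void) {
    atomic_fetch_add_explicit(&n_done, 1, memory_order_relaxed); // atomicity only, no ordering
}

static LONG count_done(void) {
    return atomic_load_explicit(&n_done, memory_order_relaxed);
}
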
@@ -93,6 +112,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
93
112
  static void atomic_flag_clear(atomic_flag * ptr) {
94
113
  InterlockedExchange(ptr, 0);
95
114
  }
115
+ #else // clang
116
+ #include <stdatomic.h>
117
+ #endif
96
118
 
97
119
  typedef HANDLE pthread_t;
98
120
 
@@ -121,8 +143,13 @@ static int sched_yield (void) {
121
143
  return 0;
122
144
  }
123
145
  #else
146
+
124
147
  #include <pthread.h>
125
148
  #include <stdatomic.h>
149
+ #include <sched.h>
150
+ #if defined(__FreeBSD__)
151
+ #include <pthread_np.h>
152
+ #endif
126
153
 
127
154
  typedef void * thread_ret_t;
128
155
 
@@ -1027,7 +1054,31 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
1027
1054
  .ncols = 8,
1028
1055
  .gemv = lm_ggml_gemv_q4_0_8x8_q8_0,
1029
1056
  .gemm = lm_ggml_gemm_q4_0_8x8_q8_0,
1030
- }
1057
+ },
1058
+ [LM_GGML_TYPE_TQ1_0] = {
1059
+ .type_name = "tq1_0",
1060
+ .blck_size = QK_K,
1061
+ .type_size = sizeof(block_tq1_0),
1062
+ .is_quantized = true,
1063
+ .to_float = (lm_ggml_to_float_t) dequantize_row_tq1_0,
1064
+ .from_float = quantize_row_tq1_0,
1065
+ .from_float_ref = (lm_ggml_from_float_t) quantize_row_tq1_0_ref,
1066
+ .vec_dot = lm_ggml_vec_dot_tq1_0_q8_K,
1067
+ .vec_dot_type = LM_GGML_TYPE_Q8_K,
1068
+ .nrows = 1,
1069
+ },
1070
+ [LM_GGML_TYPE_TQ2_0] = {
1071
+ .type_name = "tq2_0",
1072
+ .blck_size = QK_K,
1073
+ .type_size = sizeof(block_tq2_0),
1074
+ .is_quantized = true,
1075
+ .to_float = (lm_ggml_to_float_t) dequantize_row_tq2_0,
1076
+ .from_float = quantize_row_tq2_0,
1077
+ .from_float_ref = (lm_ggml_from_float_t) quantize_row_tq2_0_ref,
1078
+ .vec_dot = lm_ggml_vec_dot_tq2_0_q8_K,
1079
+ .vec_dot_type = LM_GGML_TYPE_Q8_K,
1080
+ .nrows = 1,
1081
+ },
1031
1082
  };
1032
1083
 
1033
1084
  // For internal test use
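
The hunk above registers the new ternary quantization types TQ1_0 and TQ2_0 in the type-traits table, wiring up their block size (QK_K), dequantization, quantization, and Q8_K dot-product kernels. A minimal sketch of how the traits table is typically consulted, assuming the lm_-prefixed equivalent of ggml's internal type-traits accessor; the buffer names are illustrative:

// Illustrative: dequantize one TQ2_0 block through the traits table.
const lm_ggml_type_traits_t traits = lm_ggml_internal_get_type_traits(LM_GGML_TYPE_TQ2_0);
float out[QK_K];                         // traits.blck_size == QK_K for TQ2_0
traits.to_float(block_ptr, out, QK_K);   // dispatches to dequantize_row_tq2_0
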
@@ -1868,28 +1919,102 @@ struct lm_ggml_context_container {
1868
1919
  struct lm_ggml_context context;
1869
1920
  };
1870
1921
 
1871
- struct lm_ggml_compute_state_shared {
1872
- const struct lm_ggml_cgraph * cgraph;
1873
- const struct lm_ggml_cplan * cplan;
1922
+ //
1923
+ // Threading defs
1924
+ //
1925
+
1926
+ typedef pthread_t lm_ggml_thread_t;
1927
+
1928
+ #if defined(_WIN32)
1929
+
1930
+ typedef CONDITION_VARIABLE lm_ggml_cond_t;
1931
+ typedef SRWLOCK lm_ggml_mutex_t;
1932
+
1933
+ #define lm_ggml_mutex_init(m) InitializeSRWLock(m)
1934
+ #define lm_ggml_mutex_destroy(m)
1935
+ #define lm_ggml_mutex_lock(m) AcquireSRWLockExclusive(m)
1936
+ #define lm_ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
1937
+ #define lm_ggml_mutex_lock_shared(m) AcquireSRWLockShared(m)
1938
+ #define lm_ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
1939
+
1940
+ #define lm_ggml_cond_init(c) InitializeConditionVariable(c)
1941
+ #define lm_ggml_cond_destroy(c)
1942
+ #define lm_ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
1943
+ #define lm_ggml_cond_broadcast(c) WakeAllConditionVariable(c)
1944
+
1945
+ #define lm_ggml_thread_create pthread_create
1946
+ #define lm_ggml_thread_join pthread_join
1947
+
1948
+ #else
1949
+
1950
+ typedef pthread_cond_t lm_ggml_cond_t;
1951
+ typedef pthread_mutex_t lm_ggml_mutex_t;
1952
+
1953
+ #define lm_ggml_mutex_init(m) pthread_mutex_init(m, NULL)
1954
+ #define lm_ggml_mutex_destroy(m) pthread_mutex_destroy(m)
1955
+ #define lm_ggml_mutex_lock(m) pthread_mutex_lock(m)
1956
+ #define lm_ggml_mutex_unlock(m) pthread_mutex_unlock(m)
1957
+ #define lm_ggml_mutex_lock_shared(m) pthread_mutex_lock(m)
1958
+ #define lm_ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
1959
+
1960
+ #define lm_ggml_lock_init(x) UNUSED(x)
1961
+ #define lm_ggml_lock_destroy(x) UNUSED(x)
1962
+ #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
1963
+ #define lm_ggml_lock_lock(x) _mm_pause()
1964
+ #else
1965
+ #define lm_ggml_lock_lock(x) UNUSED(x)
1966
+ #endif
1967
+ #define lm_ggml_lock_unlock(x) UNUSED(x)
1968
+
1969
+ #define LM_GGML_LOCK_INITIALIZER 0
1970
+ #define lm_ggml_cond_init(c) pthread_cond_init(c, NULL)
1971
+ #define lm_ggml_cond_destroy(c) pthread_cond_destroy(c)
1972
+ #define lm_ggml_cond_wait(c, m) pthread_cond_wait(c, m)
1973
+ #define lm_ggml_cond_broadcast(c) pthread_cond_broadcast(c)
1974
+
1975
+ #define lm_ggml_thread_create pthread_create
1976
+ #define lm_ggml_thread_join pthread_join
1977
+
1978
+ #endif
1979
+
1980
+ // Threadpool def
1981
+ struct lm_ggml_threadpool {
1982
+ lm_ggml_mutex_t mutex; // mutex for cond.var
1983
+ lm_ggml_cond_t cond; // cond.var for waiting for new work
1874
1984
 
1875
- int n_threads;
1985
+ struct lm_ggml_cgraph * cgraph;
1986
+ struct lm_ggml_cplan * cplan;
1876
1987
 
1877
1988
  // synchronization primitives
1989
+ atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
1878
1990
  atomic_int n_barrier;
1879
1991
  atomic_int n_barrier_passed;
1992
+ atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1880
1993
 
1881
- lm_ggml_abort_callback abort_callback; // abort lm_ggml_graph_compute when true
1882
- void * abort_callback_data;
1994
+ // these are atomic as an annotation for thread-sanitizer
1995
+ atomic_bool stop; // Used for stopping the threadpool altogether
1996
+ atomic_bool pause; // Used for pausing the threadpool or individual threads
1883
1997
 
1884
- atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
1998
+ struct lm_ggml_compute_state * workers; // per thread state
1999
+ int n_threads_max; // number of threads in the pool
2000
+ int n_threads_cur; // number of threads used in the current graph
2001
+
2002
+ int32_t prio; // Scheduling priority
2003
+ uint32_t poll; // Polling level (0 - no polling)
1885
2004
 
1886
2005
  enum lm_ggml_status ec;
1887
2006
  };
1888
2007
 
2008
+ // Per-thread state
1889
2009
  struct lm_ggml_compute_state {
2010
+ #ifndef LM_GGML_USE_OPENMP
1890
2011
  lm_ggml_thread_t thrd;
2012
+ bool cpumask[LM_GGML_MAX_N_THREADS];
2013
+ int last_graph;
2014
+ bool pending;
2015
+ #endif
2016
+ struct lm_ggml_threadpool * threadpool;
1891
2017
  int ith;
1892
- struct lm_ggml_compute_state_shared * shared;
1893
2018
  };
1894
2019
 
1895
2020
  struct lm_ggml_compute_params {
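
The large hunk above replaces lm_ggml_compute_state_shared with a persistent lm_ggml_threadpool plus portable mutex/condition-variable macros (SRWLOCK/CONDITION_VARIABLE on Windows, pthreads elsewhere) and per-thread state (CPU mask, last_graph, pending). A rough sketch of the wake-up pattern these macros support when new work is posted; this is illustrative only, not the package's exact kickoff logic:

// Illustrative: notify sleeping workers that a new graph is ready.
struct lm_ggml_threadpool * tp = /* ... */;
lm_ggml_mutex_lock(&tp->mutex);
atomic_fetch_add_explicit(&tp->n_graph, 1, memory_order_relaxed); // bump the work counter
lm_ggml_cond_broadcast(&tp->cond);                                // wake all waiting workers
lm_ggml_mutex_unlock(&tp->mutex);
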
@@ -1900,7 +2025,7 @@ struct lm_ggml_compute_params {
1900
2025
  size_t wsize;
1901
2026
  void * wdata;
1902
2027
 
1903
- struct lm_ggml_compute_state_shared * shared;
2028
+ struct lm_ggml_threadpool * threadpool;
1904
2029
  };
1905
2030
 
1906
2031
  //
@@ -2310,7 +2435,9 @@ inline static void lm_ggml_vec_scale_f16(const int n, lm_ggml_fp16_t * y, const
2310
2435
  inline static void lm_ggml_vec_norm_f32 (const int n, float * s, const float * x) { lm_ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
2311
2436
  inline static void lm_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
2312
2437
  inline static void lm_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
2313
- inline static void lm_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
2438
+ inline static void lm_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
2439
+ inline static void lm_ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
2440
+ inline static void lm_ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
2314
2441
  inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
2315
2442
  inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
2316
2443
  inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
@@ -2322,6 +2449,7 @@ inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float
2322
2449
  // TODO: optimize performance
2323
2450
  inline static void lm_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
2324
2451
  inline static void lm_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
2452
+ inline static void lm_ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
2325
2453
 
2326
2454
  static const float GELU_COEF_A = 0.044715f;
2327
2455
  static const float GELU_QUICK_COEF = -1.702f;
@@ -2669,6 +2797,19 @@ static lm_ggml_float lm_ggml_vec_soft_max_f32(const int n, float * y, const floa
2669
2797
  return sum;
2670
2798
  }
2671
2799
 
2800
+ static lm_ggml_float lm_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
2801
+ // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
2802
+
2803
+ int i = 0;
2804
+ lm_ggml_float sum = 0;
2805
+ for (; i < n; ++i) {
2806
+ float val = x[i] - max;
2807
+ y[i] = val;
2808
+ sum += (lm_ggml_float)expf(val);
2809
+ }
2810
+ return sum = (lm_ggml_float)logf(sum);
2811
+ }
2812
+
2672
2813
  inline static float lm_ggml_silu_backward_f32(float x, float dy) {
2673
2814
  const float s = 1.0f/(1.0f + expf(-x));
2674
2815
  return dy*s*(1.0f + x*(1.0f - s));
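
For reference, the lm_ggml_vec_log_soft_max_f32 helper added above relies on the standard identity log(softmax_i(x)) = (x_i - max_j x_j) - log(sum_j exp(x_j - max_j x_j)): it writes x_i - max into y[i] and returns the log of the accumulated exponential sum, which the caller is expected to subtract from each y[i] to obtain the log-probabilities.
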
@@ -2760,6 +2901,8 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
2760
2901
  "SQR",
2761
2902
  "SQRT",
2762
2903
  "LOG",
2904
+ "SIN",
2905
+ "COS",
2763
2906
  "SUM",
2764
2907
  "SUM_ROWS",
2765
2908
  "MEAN",
@@ -2797,9 +2940,11 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
2797
2940
  "CLAMP",
2798
2941
  "CONV_TRANSPOSE_1D",
2799
2942
  "IM2COL",
2943
+ "IM2COL_BACK",
2800
2944
  "CONV_TRANSPOSE_2D",
2801
2945
  "POOL_1D",
2802
2946
  "POOL_2D",
2947
+ "POOL_2D_BACK",
2803
2948
  "UPSCALE",
2804
2949
  "PAD",
2805
2950
  "ARANGE",
@@ -2815,6 +2960,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
2815
2960
  "WIN_UNPART",
2816
2961
  "GET_REL_POS",
2817
2962
  "ADD_REL_POS",
2963
+ "RWKV_WKV",
2818
2964
 
2819
2965
  "UNARY",
2820
2966
 
@@ -2833,7 +2979,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
2833
2979
  "CROSS_ENTROPY_LOSS_BACK",
2834
2980
  };
2835
2981
 
2836
- static_assert(LM_GGML_OP_COUNT == 74, "LM_GGML_OP_COUNT != 74");
2982
+ static_assert(LM_GGML_OP_COUNT == 79, "LM_GGML_OP_COUNT != 79");
2837
2983
 
2838
2984
  static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
2839
2985
  "none",
@@ -2848,6 +2994,8 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
2848
2994
  "x^2",
2849
2995
  "√x",
2850
2996
  "log(x)",
2997
+ "sin(x)",
2998
+ "cos(x)",
2851
2999
  "Σx",
2852
3000
  "Σx_k",
2853
3001
  "Σx/n",
@@ -2885,9 +3033,11 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
2885
3033
  "clamp(x)",
2886
3034
  "conv_transpose_1d(x)",
2887
3035
  "im2col(x)",
3036
+ "im2col_back(x)",
2888
3037
  "conv_transpose_2d(x)",
2889
3038
  "pool_1d(x)",
2890
3039
  "pool_2d(x)",
3040
+ "pool_2d_back(x)",
2891
3041
  "upscale(x)",
2892
3042
  "pad(x)",
2893
3043
  "arange(start, stop, step)",
@@ -2903,6 +3053,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
2903
3053
  "win_unpart(x)",
2904
3054
  "get_rel_pos(x)",
2905
3055
  "add_rel_pos(x)",
3056
+ "rwkv_wkv(k, v, r, tf, td, s)",
2906
3057
 
2907
3058
  "unary(x)",
2908
3059
 
@@ -2921,7 +3072,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
2921
3072
  "cross_entropy_loss_back(x,y)",
2922
3073
  };
2923
3074
 
2924
- static_assert(LM_GGML_OP_COUNT == 74, "LM_GGML_OP_COUNT != 74");
3075
+ static_assert(LM_GGML_OP_COUNT == 79, "LM_GGML_OP_COUNT != 79");
2925
3076
 
2926
3077
  static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");
2927
3078
 
@@ -2940,14 +3091,28 @@ static const char * LM_GGML_UNARY_OP_NAME[LM_GGML_UNARY_OP_COUNT] = {
2940
3091
  "SILU",
2941
3092
  "HARDSWISH",
2942
3093
  "HARDSIGMOID",
3094
+ "EXP",
2943
3095
  };
2944
3096
 
2945
- static_assert(LM_GGML_UNARY_OP_COUNT == 13, "LM_GGML_UNARY_OP_COUNT != 13");
3097
+ static_assert(LM_GGML_UNARY_OP_COUNT == 14, "LM_GGML_UNARY_OP_COUNT != 14");
2946
3098
 
2947
3099
 
2948
3100
  static_assert(sizeof(struct lm_ggml_object)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_object size must be a multiple of LM_GGML_MEM_ALIGN");
2949
3101
  static_assert(sizeof(struct lm_ggml_tensor)%LM_GGML_MEM_ALIGN == 0, "lm_ggml_tensor size must be a multiple of LM_GGML_MEM_ALIGN");
2950
3102
 
3103
+ // Helpers for polling loops
3104
+ #if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
3105
+ static inline void lm_ggml_thread_cpu_relax(void) {
3106
+ __asm__ volatile("yield" ::: "memory");
3107
+ }
3108
+ #elif defined(__x86_64__)
3109
+ static inline void lm_ggml_thread_cpu_relax(void) {
3110
+ _mm_pause();
3111
+ }
3112
+ #else
3113
+ static inline void lm_ggml_thread_cpu_relax(void) {;}
3114
+ #endif
3115
+
2951
3116
  //
2952
3117
  // NUMA support
2953
3118
  //
@@ -2995,42 +3160,36 @@ inline static void lm_ggml_critical_section_start(void) {
2995
3160
  }
2996
3161
 
2997
3162
  #ifdef LM_GGML_USE_OPENMP
2998
- static void lm_ggml_barrier(struct lm_ggml_compute_state_shared * shared) {
2999
- if (shared->n_threads == 1) {
3163
+ static void lm_ggml_barrier(struct lm_ggml_threadpool * threadpool) {
3164
+ if (threadpool->n_threads_cur == 1) {
3000
3165
  return;
3001
3166
  }
3002
3167
 
3003
3168
  #pragma omp barrier
3004
3169
  }
3005
3170
  #else
3006
- static void lm_ggml_barrier(struct lm_ggml_compute_state_shared * shared) {
3007
- if (shared->n_threads == 1) {
3171
+ static void lm_ggml_barrier(struct lm_ggml_threadpool * threadpool) {
3172
+ if (threadpool->n_threads_cur == 1) {
3008
3173
  return;
3009
3174
  }
3010
3175
 
3011
- atomic_int * n_barrier = &shared->n_barrier;
3012
- atomic_int * n_barrier_passed = &shared->n_barrier_passed;
3176
+ atomic_int * n_barrier = &threadpool->n_barrier;
3177
+ atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
3013
3178
 
3014
- int n_threads = shared->n_threads;
3015
- int passed_old = atomic_load(n_barrier_passed);
3179
+ int n_threads = threadpool->n_threads_cur;
3180
+ int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
3016
3181
 
3017
3182
  if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
3018
3183
  // last thread
3019
3184
  atomic_store(n_barrier, 0);
3020
- atomic_fetch_add(n_barrier_passed, 1);
3185
+ atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
3021
3186
  } else {
3022
3187
  // wait for other threads
3023
- const int n_spin_before_sleep = 100000;
3024
3188
  while (true) {
3025
- for (int i = 0; i < n_spin_before_sleep; i++) {
3026
- if (atomic_load(n_barrier_passed) != passed_old) {
3027
- return;
3028
- }
3029
- #if defined(__SSE3__)
3030
- _mm_pause();
3031
- #endif
3189
+ if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
3190
+ return;
3032
3191
  }
3033
- sched_yield();
3192
+ lm_ggml_thread_cpu_relax();
3034
3193
  }
3035
3194
  }
3036
3195
  }
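
The reworked lm_ggml_barrier above spins on n_barrier_passed with relaxed atomics and the new lm_ggml_thread_cpu_relax() (yield / _mm_pause) instead of the previous spin-then-sched_yield loop, and it now takes the threadpool rather than the old shared state. A rough sketch of how such a per-op barrier is used by the CPU workers; the surrounding function name is assumed from context:

// Illustrative: every worker computes its slice of a node, then waits for the others.
lm_ggml_compute_forward(&params, node);   // params.ith / params.nth pick this thread's slice
lm_ggml_barrier(params.threadpool);       // all writes to node are visible before the next node
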
@@ -3767,6 +3926,7 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
3767
3926
  }
3768
3927
 
3769
3928
  struct lm_ggml_object * const obj_new = lm_ggml_new_object(ctx, LM_GGML_OBJECT_TYPE_TENSOR, LM_GGML_TENSOR_SIZE + obj_alloc_size);
3929
+ LM_GGML_ASSERT(obj_new);
3770
3930
 
3771
3931
  // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
3772
3932
 
@@ -4486,8 +4646,6 @@ static struct lm_ggml_tensor * lm_ggml_add_impl(
4486
4646
  bool is_node = false;
4487
4647
 
4488
4648
  if (!inplace && (a->grad || b->grad)) {
4489
- // TODO: support backward pass for broadcasting
4490
- LM_GGML_ASSERT(lm_ggml_are_same_shape(a, b));
4491
4649
  is_node = true;
4492
4650
  }
4493
4651
 
@@ -4661,11 +4819,13 @@ static struct lm_ggml_tensor * lm_ggml_sub_impl(
4661
4819
  struct lm_ggml_tensor * a,
4662
4820
  struct lm_ggml_tensor * b,
4663
4821
  bool inplace) {
4664
- LM_GGML_ASSERT(lm_ggml_are_same_shape(a, b));
4822
+ LM_GGML_ASSERT(lm_ggml_can_repeat(b, a));
4665
4823
 
4666
4824
  bool is_node = false;
4667
4825
 
4668
4826
  if (!inplace && (a->grad || b->grad)) {
4827
+ // TODO: support backward pass for broadcasting
4828
+ LM_GGML_ASSERT(lm_ggml_are_same_shape(a, b));
4669
4829
  is_node = true;
4670
4830
  }
4671
4831
 
@@ -4880,6 +5040,72 @@ struct lm_ggml_tensor * lm_ggml_log_inplace(
4880
5040
  return lm_ggml_log_impl(ctx, a, true);
4881
5041
  }
4882
5042
 
5043
+ // lm_ggml_sin
5044
+
5045
+ static struct lm_ggml_tensor * lm_ggml_sin_impl(
5046
+ struct lm_ggml_context * ctx,
5047
+ struct lm_ggml_tensor * a,
5048
+ bool inplace) {
5049
+ bool is_node = false;
5050
+
5051
+ if (!inplace && (a->grad)) {
5052
+ is_node = true;
5053
+ }
5054
+
5055
+ struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
5056
+
5057
+ result->op = LM_GGML_OP_SIN;
5058
+ result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
5059
+ result->src[0] = a;
5060
+
5061
+ return result;
5062
+ }
5063
+
5064
+ struct lm_ggml_tensor * lm_ggml_sin(
5065
+ struct lm_ggml_context * ctx,
5066
+ struct lm_ggml_tensor * a) {
5067
+ return lm_ggml_sin_impl(ctx, a, false);
5068
+ }
5069
+
5070
+ struct lm_ggml_tensor * lm_ggml_sin_inplace(
5071
+ struct lm_ggml_context * ctx,
5072
+ struct lm_ggml_tensor * a) {
5073
+ return lm_ggml_sin_impl(ctx, a, true);
5074
+ }
5075
+
5076
+ // lm_ggml_cos
5077
+
5078
+ static struct lm_ggml_tensor * lm_ggml_cos_impl(
5079
+ struct lm_ggml_context * ctx,
5080
+ struct lm_ggml_tensor * a,
5081
+ bool inplace) {
5082
+ bool is_node = false;
5083
+
5084
+ if (!inplace && (a->grad)) {
5085
+ is_node = true;
5086
+ }
5087
+
5088
+ struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
5089
+
5090
+ result->op = LM_GGML_OP_COS;
5091
+ result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
5092
+ result->src[0] = a;
5093
+
5094
+ return result;
5095
+ }
5096
+
5097
+ struct lm_ggml_tensor * lm_ggml_cos(
5098
+ struct lm_ggml_context * ctx,
5099
+ struct lm_ggml_tensor * a) {
5100
+ return lm_ggml_cos_impl(ctx, a, false);
5101
+ }
5102
+
5103
+ struct lm_ggml_tensor * lm_ggml_cos_inplace(
5104
+ struct lm_ggml_context * ctx,
5105
+ struct lm_ggml_tensor * a) {
5106
+ return lm_ggml_cos_impl(ctx, a, true);
5107
+ }
5108
+
4883
5109
  // lm_ggml_sum
4884
5110
 
4885
5111
  struct lm_ggml_tensor * lm_ggml_sum(
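
The hunk above adds element-wise lm_ggml_sin / lm_ggml_cos graph ops (with in-place variants). A minimal usage sketch, assuming the usual lm_-prefixed ggml context and graph helpers (lm_ggml_init, lm_ggml_new_tensor_1d, lm_ggml_new_graph, lm_ggml_build_forward_expand):

// Illustrative usage of the new ops (error handling omitted).
struct lm_ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
struct lm_ggml_context * ctx = lm_ggml_init(ip);
struct lm_ggml_tensor  * x   = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 8);
struct lm_ggml_tensor  * y   = lm_ggml_sin(ctx, x);   // y[i] = sinf(x[i])
struct lm_ggml_cgraph  * gf  = lm_ggml_new_graph(ctx);
lm_ggml_build_forward_expand(gf, y);
// fill x->data, then run gf with the CPU backend as usual
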
@@ -5041,6 +5267,7 @@ struct lm_ggml_tensor * lm_ggml_concat(
5041
5267
  bool is_node = false;
5042
5268
 
5043
5269
  if (a->grad || b->grad) {
5270
+ LM_GGML_ABORT("fatal error"); // TODO: implement
5044
5271
  is_node = true;
5045
5272
  }
5046
5273
 
@@ -5162,6 +5389,7 @@ struct lm_ggml_tensor * lm_ggml_leaky_relu(
5162
5389
  bool is_node = false;
5163
5390
 
5164
5391
  if (!inplace && (a->grad)) {
5392
+ LM_GGML_ABORT("fatal error"); // TODO: not implemented
5165
5393
  is_node = true;
5166
5394
  }
5167
5395
 
@@ -5269,6 +5497,19 @@ struct lm_ggml_tensor * lm_ggml_hardsigmoid(
5269
5497
  return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_HARDSIGMOID);
5270
5498
  }
5271
5499
 
5500
+ // ggml exp
5501
+ struct lm_ggml_tensor * lm_ggml_exp(
5502
+ struct lm_ggml_context * ctx,
5503
+ struct lm_ggml_tensor * a) {
5504
+ return lm_ggml_unary(ctx, a, LM_GGML_UNARY_OP_EXP);
5505
+ }
5506
+
5507
+ struct lm_ggml_tensor * lm_ggml_exp_inplace(
5508
+ struct lm_ggml_context * ctx,
5509
+ struct lm_ggml_tensor * a) {
5510
+ return lm_ggml_unary_inplace(ctx, a, LM_GGML_UNARY_OP_EXP);
5511
+ }
5512
+
5272
5513
  // lm_ggml_norm
5273
5514
 
5274
5515
  static struct lm_ggml_tensor * lm_ggml_norm_impl(
@@ -5587,6 +5828,7 @@ static struct lm_ggml_tensor * lm_ggml_set_impl(
5587
5828
  // make a view of the destination
5588
5829
  struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
5589
5830
 
5831
+ LM_GGML_ASSERT(offset < (size_t)(1 << 30));
5590
5832
  int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
5591
5833
  lm_ggml_set_op_params(result, params, sizeof(params));
5592
5834
 
@@ -6544,14 +6786,12 @@ struct lm_ggml_tensor * lm_ggml_rope_back(
6544
6786
  LM_GGML_ASSERT(lm_ggml_is_vector(b));
6545
6787
  LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32);
6546
6788
  LM_GGML_ASSERT(a->ne[2] == b->ne[0]);
6547
- LM_GGML_ASSERT(c == NULL && "freq factors not implemented yet");
6548
-
6549
- LM_GGML_ASSERT((mode & 4) == 0 && "lm_ggml_rope_back() for ChatGLM not implemented yet");
6550
6789
 
6551
6790
  bool is_node = false;
6552
6791
 
6553
6792
  if (a->grad) {
6554
- is_node = false; // TODO: implement backward
6793
+ LM_GGML_ASSERT(false && "backwards pass not implemented");
6794
+ is_node = false;
6555
6795
  }
6556
6796
 
6557
6797
  struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a);
@@ -6569,6 +6809,7 @@ struct lm_ggml_tensor * lm_ggml_rope_back(
6569
6809
  result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
6570
6810
  result->src[0] = a;
6571
6811
  result->src[1] = b;
6812
+ result->src[2] = c;
6572
6813
 
6573
6814
  return result;
6574
6815
  }
@@ -6727,17 +6968,20 @@ struct lm_ggml_tensor * lm_ggml_im2col(
6727
6968
  LM_GGML_ASSERT(a->ne[2] == b->ne[2]);
6728
6969
  } else {
6729
6970
  LM_GGML_ASSERT(a->ne[1] == b->ne[1]);
6971
+ LM_GGML_ASSERT(b->ne[3] == 1);
6730
6972
  }
6731
6973
  bool is_node = false;
6732
6974
 
6733
- if (a->grad || b->grad) {
6734
- LM_GGML_ABORT("fatal error"); // TODO: implement backward
6975
+ if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data
6735
6976
  is_node = true;
6736
6977
  }
6737
6978
 
6738
6979
  const int64_t OH = is_2D ? lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
6739
6980
  const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
6740
6981
 
6982
+ LM_GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
6983
+ LM_GGML_ASSERT((OW > 0) && "b too small compared to a");
6984
+
6741
6985
  const int64_t ne[4] = {
6742
6986
  is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
6743
6987
  OW,
@@ -6757,6 +7001,37 @@ struct lm_ggml_tensor * lm_ggml_im2col(
6757
7001
  return result;
6758
7002
  }
6759
7003
 
7004
+ struct lm_ggml_tensor * lm_ggml_im2col_back(
7005
+ struct lm_ggml_context * ctx,
7006
+ struct lm_ggml_tensor * a,
7007
+ struct lm_ggml_tensor * b,
7008
+ int64_t * ne,
7009
+ int s0,
7010
+ int s1,
7011
+ int p0,
7012
+ int p1,
7013
+ int d0,
7014
+ int d1,
7015
+ bool is_2D) {
7016
+
7017
+ bool is_node = false;
7018
+
7019
+ if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data
7020
+ is_node = true;
7021
+ }
7022
+
7023
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
7024
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
7025
+ lm_ggml_set_op_params(result, params, sizeof(params));
7026
+
7027
+ result->op = LM_GGML_OP_IM2COL_BACK;
7028
+ result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
7029
+ result->src[0] = a;
7030
+ result->src[1] = b;
7031
+
7032
+ return result;
7033
+ }
7034
+
6760
7035
  // a: [OC,IC, KH, KW]
6761
7036
  // b: [N, IC, IH, IW]
6762
7037
  // result: [N, OC, OH, OW]
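
The new asserts in lm_ggml_im2col above reject kernels that are larger than the (padded) input, i.e. cases where the computed output extent would be non-positive. For reference, the conventional dilated-convolution output size used by lm_ggml_calc_conv_output_size is:

// OW = (IW + 2*p0 - d0*(KW - 1) - 1)/s0 + 1
// example: IW = 8, KW = 3, s0 = 2, p0 = 1, d0 = 1  ->  OW = (8 + 2 - 2 - 1)/2 + 1 = 4
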
@@ -6770,7 +7045,7 @@ struct lm_ggml_tensor * lm_ggml_conv_2d(
6770
7045
  int p1,
6771
7046
  int d0,
6772
7047
  int d1) {
6773
- struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, LM_GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
7048
+ struct lm_ggml_tensor * im2col = lm_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
6774
7049
 
6775
7050
  struct lm_ggml_tensor * result =
6776
7051
  lm_ggml_mul_mat(ctx,
@@ -6896,17 +7171,17 @@ struct lm_ggml_tensor * lm_ggml_pool_2d(
6896
7171
  bool is_node = false;
6897
7172
 
6898
7173
  if (a->grad) {
6899
- LM_GGML_ABORT("fatal error"); // TODO: implement backward
6900
7174
  is_node = true;
6901
7175
  }
6902
7176
 
6903
7177
  struct lm_ggml_tensor * result;
6904
- const int64_t ne[3] = {
7178
+ const int64_t ne[4] = {
6905
7179
  lm_ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
6906
7180
  lm_ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
6907
7181
  a->ne[2],
7182
+ a->ne[3],
6908
7183
  };
6909
- result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 3, ne);
7184
+ result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
6910
7185
 
6911
7186
  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
6912
7187
  lm_ggml_set_op_params(result, params, sizeof(params));
@@ -6917,6 +7192,37 @@ struct lm_ggml_tensor * lm_ggml_pool_2d(
6917
7192
  return result;
6918
7193
  }
6919
7194
 
7195
+ struct lm_ggml_tensor * lm_ggml_pool_2d_back(
7196
+ struct lm_ggml_context * ctx,
7197
+ struct lm_ggml_tensor * a,
7198
+ struct lm_ggml_tensor * af,
7199
+ enum lm_ggml_op_pool op,
7200
+ int k0,
7201
+ int k1,
7202
+ int s0,
7203
+ int s1,
7204
+ float p0,
7205
+ float p1) {
7206
+
7207
+ bool is_node = false;
7208
+
7209
+ if (a->grad) {
7210
+ is_node = true;
7211
+ }
7212
+
7213
+ struct lm_ggml_tensor * result;
7214
+ result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, af->ne);
7215
+
7216
+ int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7217
+ lm_ggml_set_op_params(result, params, sizeof(params));
7218
+
7219
+ result->op = LM_GGML_OP_POOL_2D_BACK;
7220
+ result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
7221
+ result->src[0] = a;
7222
+ result->src[1] = af;
7223
+ return result;
7224
+ }
7225
+
6920
7226
  // lm_ggml_upscale
6921
7227
 
6922
7228
  static struct lm_ggml_tensor * lm_ggml_upscale_impl(
@@ -7057,6 +7363,11 @@ struct lm_ggml_tensor * lm_ggml_argsort(
7057
7363
  enum lm_ggml_sort_order order) {
7058
7364
  bool is_node = false;
7059
7365
 
7366
+ if (a->grad) {
7367
+ LM_GGML_ABORT("fatal error"); // TODO: not implemented
7368
+ is_node = true;
7369
+ }
7370
+
7060
7371
  struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_I32, LM_GGML_MAX_DIMS, a->ne);
7061
7372
 
7062
7373
  lm_ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -7467,6 +7778,59 @@ struct lm_ggml_tensor * lm_ggml_add_rel_pos_inplace(
7467
7778
  return lm_ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
7468
7779
  }
7469
7780
 
7781
+ // lm_ggml_rwkv_wkv
7782
+
7783
+ struct lm_ggml_tensor * lm_ggml_rwkv_wkv(
7784
+ struct lm_ggml_context * ctx,
7785
+ struct lm_ggml_tensor * k,
7786
+ struct lm_ggml_tensor * v,
7787
+ struct lm_ggml_tensor * r,
7788
+ struct lm_ggml_tensor * tf,
7789
+ struct lm_ggml_tensor * td,
7790
+ struct lm_ggml_tensor * state) {
7791
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(k));
7792
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(v));
7793
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(r));
7794
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(tf));
7795
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(td));
7796
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(state));
7797
+
7798
+ const int64_t S = k->ne[0];
7799
+ const int64_t H = k->ne[2];
7800
+ const int64_t n_tokens = k->ne[3];
7801
+ const int64_t n_seqs = state->ne[1];
7802
+ {
7803
+ LM_GGML_ASSERT(k->ne[1] == 1);
7804
+ LM_GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens);
7805
+ LM_GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens);
7806
+ // TODO: RWKV v4 and v5
7807
+ LM_GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
7808
+ LM_GGML_ASSERT(lm_ggml_nelements(state) == S * S * H * n_seqs);
7809
+ }
7810
+
7811
+ bool is_node = false;
7812
+
7813
+ if (k->grad || v->grad || r->grad || tf->grad || td->grad || state->grad) {
7814
+ LM_GGML_ABORT("fatal error"); // TODO: implement backward
7815
+ is_node = true;
7816
+ }
7817
+
7818
+ // concat output and new_state
7819
+ const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
7820
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
7821
+
7822
+ result->op = LM_GGML_OP_RWKV_WKV;
7823
+ result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
7824
+ result->src[0] = k;
7825
+ result->src[1] = v;
7826
+ result->src[2] = r;
7827
+ result->src[3] = tf;
7828
+ result->src[4] = td;
7829
+ result->src[5] = state;
7830
+
7831
+ return result;
7832
+ }
7833
+
7470
7834
  // lm_ggml_unary
7471
7835
 
7472
7836
  static struct lm_ggml_tensor * lm_ggml_unary_impl(
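
The new lm_ggml_rwkv_wkv op above packs its two results into one F32 tensor of shape { S*H, n_tokens + S*n_seqs }: the first n_tokens rows hold the per-token output and the remaining S*n_seqs rows hold the updated state. A minimal sketch of splitting them back apart, assuming the usual lm_-prefixed view helper:

// Illustrative: separate the token output from the new state.
struct lm_ggml_tensor * wkv = lm_ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
struct lm_ggml_tensor * out = lm_ggml_view_2d(ctx, wkv, S*H, n_tokens, wkv->nb[1], 0);
struct lm_ggml_tensor * new_state =
    lm_ggml_view_2d(ctx, wkv, S*H, S*n_seqs, wkv->nb[1], n_tokens*wkv->nb[1]);
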
@@ -7965,8 +8329,7 @@ static void lm_ggml_compute_forward_dup_same_cont(
7965
8329
  LM_GGML_ASSERT(lm_ggml_is_contiguous(dst) && lm_ggml_is_contiguous(src0));
7966
8330
  LM_GGML_ASSERT(src0->type == dst->type);
7967
8331
 
7968
- const size_t nb00 = src0->nb[0];
7969
- const size_t nb0 = dst->nb[0];
8332
+ const size_t nb0 = lm_ggml_type_size(src0->type);
7970
8333
 
7971
8334
  const int ith = params->ith; // thread index
7972
8335
  const int nth = params->nth; // number of threads
@@ -7980,8 +8343,8 @@ static void lm_ggml_compute_forward_dup_same_cont(
7980
8343
  if (ie0 < ie1) {
7981
8344
  memcpy(
7982
8345
  ((char *) dst->data + ie0*nb0),
7983
- ((char *) src0->data + ie0*nb00),
7984
- (ie1 - ie0) * lm_ggml_type_size(src0->type));
8346
+ ((char *) src0->data + ie0*nb0),
8347
+ (ie1 - ie0) * nb0);
7985
8348
  }
7986
8349
  }
7987
8350
 
@@ -7998,11 +8361,6 @@ static void lm_ggml_compute_forward_dup_f16(
7998
8361
  const int ith = params->ith; // thread index
7999
8362
  const int nth = params->nth; // number of threads
8000
8363
 
8001
- if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) {
8002
- lm_ggml_compute_forward_dup_same_cont(params, dst);
8003
- return;
8004
- }
8005
-
8006
8364
  // parallelize by rows
8007
8365
  const int nr = ne01;
8008
8366
  // number of rows per thread
@@ -8267,11 +8625,6 @@ static void lm_ggml_compute_forward_dup_bf16(
8267
8625
  const int ith = params->ith; // thread index
8268
8626
  const int nth = params->nth; // number of threads
8269
8627
 
8270
- if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) {
8271
- lm_ggml_compute_forward_dup_same_cont(params, dst);
8272
- return;
8273
- }
8274
-
8275
8628
  // parallelize by rows
8276
8629
  const int nr = ne01;
8277
8630
  // number of rows per thread
@@ -8623,11 +8976,6 @@ static void lm_ggml_compute_forward_dup_f32(
8623
8976
  const int ith = params->ith; // thread index
8624
8977
  const int nth = params->nth; // number of threads
8625
8978
 
8626
- if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) {
8627
- lm_ggml_compute_forward_dup_same_cont(params, dst);
8628
- return;
8629
- }
8630
-
8631
8979
  // parallelize by rows
8632
8980
  const int nr = ne01;
8633
8981
  // number of rows per thread
@@ -8937,13 +9285,13 @@ static void lm_ggml_compute_forward_dup_bytes(
8937
9285
  LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0));
8938
9286
  LM_GGML_ASSERT(src0->type == dst->type);
8939
9287
 
9288
+ LM_GGML_TENSOR_UNARY_OP_LOCALS;
9289
+
8940
9290
  if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst)) {
8941
9291
  lm_ggml_compute_forward_dup_same_cont(params, dst);
8942
9292
  return;
8943
9293
  }
8944
9294
 
8945
- LM_GGML_TENSOR_UNARY_OP_LOCALS;
8946
-
8947
9295
  const size_t type_size = lm_ggml_type_size(src0->type);
8948
9296
  const int ith = params->ith; // thread index
8949
9297
  const int nth = params->nth; // number of threads
@@ -9564,6 +9912,8 @@ static void lm_ggml_compute_forward_add(
9564
9912
  case LM_GGML_TYPE_Q4_K:
9565
9913
  case LM_GGML_TYPE_Q5_K:
9566
9914
  case LM_GGML_TYPE_Q6_K:
9915
+ case LM_GGML_TYPE_TQ1_0:
9916
+ case LM_GGML_TYPE_TQ2_0:
9567
9917
  case LM_GGML_TYPE_IQ2_XXS:
9568
9918
  case LM_GGML_TYPE_IQ2_XS:
9569
9919
  case LM_GGML_TYPE_IQ3_XXS:
@@ -9942,6 +10292,8 @@ static void lm_ggml_compute_forward_add1(
9942
10292
  case LM_GGML_TYPE_Q4_K:
9943
10293
  case LM_GGML_TYPE_Q5_K:
9944
10294
  case LM_GGML_TYPE_Q6_K:
10295
+ case LM_GGML_TYPE_TQ1_0:
10296
+ case LM_GGML_TYPE_TQ2_0:
9945
10297
  case LM_GGML_TYPE_IQ2_XXS:
9946
10298
  case LM_GGML_TYPE_IQ2_XS:
9947
10299
  case LM_GGML_TYPE_IQ3_XXS:
@@ -9993,7 +10345,7 @@ static void lm_ggml_compute_forward_acc_f32(
9993
10345
  ((char *) src0->data),
9994
10346
  lm_ggml_nbytes(dst));
9995
10347
  }
9996
- lm_ggml_barrier(params->shared);
10348
+ lm_ggml_barrier(params->threadpool);
9997
10349
  }
9998
10350
 
9999
10351
  const int ith = params->ith;
@@ -10070,6 +10422,8 @@ static void lm_ggml_compute_forward_acc(
10070
10422
  case LM_GGML_TYPE_Q4_K:
10071
10423
  case LM_GGML_TYPE_Q5_K:
10072
10424
  case LM_GGML_TYPE_Q6_K:
10425
+ case LM_GGML_TYPE_TQ1_0:
10426
+ case LM_GGML_TYPE_TQ2_0:
10073
10427
  case LM_GGML_TYPE_IQ2_XXS:
10074
10428
  case LM_GGML_TYPE_IQ2_XS:
10075
10429
  case LM_GGML_TYPE_IQ3_XXS:
@@ -10098,11 +10452,10 @@ static void lm_ggml_compute_forward_sub_f32(
10098
10452
  const struct lm_ggml_tensor * src0 = dst->src[0];
10099
10453
  const struct lm_ggml_tensor * src1 = dst->src[1];
10100
10454
 
10101
- if (params->ith != 0) {
10102
- return;
10103
- }
10455
+ assert(lm_ggml_can_repeat(src1, src0) && lm_ggml_are_same_shape(src0, dst));
10104
10456
 
10105
- assert(lm_ggml_are_same_shape(src0, src1) && lm_ggml_are_same_shape(src0, dst));
10457
+ const int ith = params->ith;
10458
+ const int nth = params->nth;
10106
10459
 
10107
10460
  const int nr = lm_ggml_nrows(src0);
10108
10461
 
@@ -10111,40 +10464,55 @@ static void lm_ggml_compute_forward_sub_f32(
10111
10464
  LM_GGML_ASSERT( nb0 == sizeof(float));
10112
10465
  LM_GGML_ASSERT(nb00 == sizeof(float));
10113
10466
 
10467
+ // rows per thread
10468
+ const int dr = (nr + nth - 1)/nth;
10469
+
10470
+ // row range for this thread
10471
+ const int ir0 = dr*ith;
10472
+ const int ir1 = MIN(ir0 + dr, nr);
10473
+
10114
10474
  if (nb10 == sizeof(float)) {
10115
- for (int ir = 0; ir < nr; ++ir) {
10116
- // src0, src1 and dst are same shape => same indices
10117
- const int i3 = ir/(ne2*ne1);
10118
- const int i2 = (ir - i3*ne2*ne1)/ne1;
10119
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
10475
+ for (int ir = ir0; ir < ir1; ++ir) {
10476
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
10477
+ const int64_t i03 = ir/(ne02*ne01);
10478
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
10479
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
10120
10480
 
10121
- #ifdef LM_GGML_USE_ACCELERATE
10122
- vDSP_vsub(
10123
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
10124
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
10125
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
10126
- ne0);
10127
- #else
10128
- lm_ggml_vec_sub_f32(ne0,
10129
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
10130
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
10131
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
10481
+ const int64_t i13 = i03 % ne13;
10482
+ const int64_t i12 = i02 % ne12;
10483
+ const int64_t i11 = i01 % ne11;
10484
+ const int64_t nr0 = ne00 / ne10;
10485
+
10486
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
10487
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
10488
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
10489
+
10490
+ for (int64_t r = 0; r < nr0; ++r) {
10491
+ #ifdef LM_GGML_USE_ACCELERATE
10492
+ vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
10493
+ #else
10494
+ lm_ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
10132
10495
  #endif
10133
- // }
10134
- // }
10496
+ }
10135
10497
  }
10136
10498
  } else {
10137
10499
  // src1 is not contiguous
10138
- for (int ir = 0; ir < nr; ++ir) {
10139
- // src0, src1 and dst are same shape => same indices
10140
- const int i3 = ir/(ne2*ne1);
10141
- const int i2 = (ir - i3*ne2*ne1)/ne1;
10142
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
10500
+ for (int ir = ir0; ir < ir1; ++ir) {
10501
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
10502
+ const int64_t i03 = ir/(ne02*ne01);
10503
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
10504
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
10505
+
10506
+ const int64_t i13 = i03 % ne13;
10507
+ const int64_t i12 = i02 % ne12;
10508
+ const int64_t i11 = i01 % ne11;
10509
+
10510
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
10511
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
10143
10512
 
10144
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
10145
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
10146
- for (int i0 = 0; i0 < ne0; i0++) {
10147
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
10513
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
10514
+ const int64_t i10 = i0 % ne10;
10515
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
10148
10516
 
10149
10517
  dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
10150
10518
  }
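
lm_ggml_compute_forward_sub_f32 above now mirrors the other broadcasting binary ops: rows are split across threads and src1 may broadcast over src0 (lm_ggml_can_repeat), with the source row picked by the usual modulo mapping. A small worked example of that mapping:

// Example: src0 has ne = {8, 4, 2, 1} and src1 has ne = {8, 1, 1, 1}.
// For src0 row (i01 = 3, i02 = 1, i03 = 0) the matching src1 row is
//   i11 = i01 % ne11 = 3 % 1 = 0,  i12 = i02 % ne12 = 0,  i13 = i03 % ne13 = 0,
// i.e. the single src1 row is re-used for every src0 row, and nr0 = ne00/ne10 = 1.
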
@@ -10490,9 +10858,9 @@ static void lm_ggml_compute_forward_log(
10490
10858
  }
10491
10859
  }
10492
10860
 
10493
- // lm_ggml_compute_forward_sum
10861
+ // lm_ggml_compute_forward_sin
10494
10862
 
10495
- static void lm_ggml_compute_forward_sum_f32(
10863
+ static void lm_ggml_compute_forward_sin_f32(
10496
10864
  const struct lm_ggml_compute_params * params,
10497
10865
  struct lm_ggml_tensor * dst) {
10498
10866
 
@@ -10502,8 +10870,95 @@ static void lm_ggml_compute_forward_sum_f32(
10502
10870
  return;
10503
10871
  }
10504
10872
 
10505
- assert(lm_ggml_is_scalar(dst));
10873
+ LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst));
10874
+
10875
+ const int n = lm_ggml_nrows(src0);
10876
+ const int nc = src0->ne[0];
10877
+
10878
+ LM_GGML_ASSERT( dst->nb[0] == sizeof(float));
10879
+ LM_GGML_ASSERT(src0->nb[0] == sizeof(float));
10880
+
10881
+ for (int i = 0; i < n; i++) {
10882
+ lm_ggml_vec_sin_f32(nc,
10883
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
10884
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
10885
+ }
10886
+ }
10887
+
10888
+ static void lm_ggml_compute_forward_sin(
10889
+ const struct lm_ggml_compute_params * params,
10890
+ struct lm_ggml_tensor * dst) {
10891
+
10892
+ const struct lm_ggml_tensor * src0 = dst->src[0];
10893
+
10894
+ switch (src0->type) {
10895
+ case LM_GGML_TYPE_F32:
10896
+ {
10897
+ lm_ggml_compute_forward_sin_f32(params, dst);
10898
+ } break;
10899
+ default:
10900
+ {
10901
+ LM_GGML_ABORT("fatal error");
10902
+ }
10903
+ }
10904
+ }
10905
+
10906
+ // lm_ggml_compute_forward_cos
10907
+
10908
+ static void lm_ggml_compute_forward_cos_f32(
10909
+ const struct lm_ggml_compute_params * params,
10910
+ struct lm_ggml_tensor * dst) {
10911
+
10912
+ const struct lm_ggml_tensor * src0 = dst->src[0];
10913
+
10914
+ if (params->ith != 0) {
10915
+ return;
10916
+ }
10917
+
10918
+ LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst));
10919
+
10920
+ const int n = lm_ggml_nrows(src0);
10921
+ const int nc = src0->ne[0];
10922
+
10923
+ LM_GGML_ASSERT( dst->nb[0] == sizeof(float));
10924
+ LM_GGML_ASSERT(src0->nb[0] == sizeof(float));
10925
+
10926
+ for (int i = 0; i < n; i++) {
10927
+ lm_ggml_vec_cos_f32(nc,
10928
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
10929
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
10930
+ }
10931
+ }
10932
+
10933
+ static void lm_ggml_compute_forward_cos(
10934
+ const struct lm_ggml_compute_params * params,
10935
+ struct lm_ggml_tensor * dst) {
10936
+
10937
+ const struct lm_ggml_tensor * src0 = dst->src[0];
10938
+
10939
+ switch (src0->type) {
10940
+ case LM_GGML_TYPE_F32:
10941
+ {
10942
+ lm_ggml_compute_forward_cos_f32(params, dst);
10943
+ } break;
10944
+ default:
10945
+ {
10946
+ LM_GGML_ABORT("fatal error");
10947
+ }
10948
+ }
10949
+ }
10950
+
10951
+ // lm_ggml_compute_forward_sum
10952
+
10953
+ static void lm_ggml_compute_forward_sum_f32(
10954
+ const struct lm_ggml_compute_params * params,
10955
+ struct lm_ggml_tensor * dst) {
10956
+
10957
+ const struct lm_ggml_tensor * src0 = dst->src[0];
10506
10958
 
10959
+ if (params->ith != 0) {
10960
+ return;
10961
+ }
10507
10962
 
10508
10963
  assert(lm_ggml_is_scalar(dst));
10509
10964
  assert(src0->nb[0] == sizeof(float));
@@ -11762,6 +12217,48 @@ static void lm_ggml_compute_forward_hardsigmoid(
11762
12217
  }
11763
12218
  }
11764
12219
 
12220
+ static void lm_ggml_compute_forward_exp_f32(
12221
+ const struct lm_ggml_compute_params * params,
12222
+ struct lm_ggml_tensor * dst) {
12223
+
12224
+ const struct lm_ggml_tensor * src0 = dst->src[0];
12225
+
12226
+ if (params->ith != 0) {
12227
+ return;
12228
+ }
12229
+
12230
+ assert(lm_ggml_is_contiguous_1(src0));
12231
+ assert(lm_ggml_is_contiguous_1(dst));
12232
+ assert(lm_ggml_are_same_shape(src0, dst));
12233
+
12234
+ const int n = lm_ggml_nrows(src0);
12235
+ const int nc = src0->ne[0];
12236
+
12237
+ for (int i = 0; i < n; i++) {
12238
+ lm_ggml_vec_exp_f32(nc,
12239
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
12240
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
12241
+ }
12242
+ }
12243
+
12244
+ static void lm_ggml_compute_forward_exp(
12245
+ const struct lm_ggml_compute_params * params,
12246
+ struct lm_ggml_tensor * dst) {
12247
+
12248
+ const struct lm_ggml_tensor * src0 = dst->src[0];
12249
+
12250
+ switch (src0->type) {
12251
+ case LM_GGML_TYPE_F32:
12252
+ {
12253
+ lm_ggml_compute_forward_exp_f32(params, dst);
12254
+ } break;
12255
+ default:
12256
+ {
12257
+ LM_GGML_ABORT("fatal error");
12258
+ }
12259
+ }
12260
+ }
12261
+
11765
12262
 
11766
12263
  // lm_ggml_compute_forward_norm
11767
12264
 
@@ -12363,10 +12860,10 @@ UseGgmlGemm1:;
12363
12860
 
12364
12861
  if (ith == 0) {
12365
12862
  // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
12366
- atomic_store(&params->shared->current_chunk, nth);
12863
+ atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
12367
12864
  }
12368
12865
 
12369
- lm_ggml_barrier(params->shared);
12866
+ lm_ggml_barrier(params->threadpool);
12370
12867
 
12371
12868
  #if LM_GGML_USE_LLAMAFILE
12372
12869
  if (src1->type != vec_dot_type) {
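
In the hunk above, the first thread seeds threadpool->current_chunk with nth so that every thread starts on its own chunk (ith) without extra coordination; once a thread finishes a chunk it claims the next one with a relaxed fetch-add, as shown in the next hunk below. The claiming loop has roughly this shape; the chunk-count names are illustrative:

// Illustrative work-claiming loop over mat-mul chunks.
int current_chunk = ith;                         // chunks 0..nth-1 are pre-assigned
while (current_chunk < nchunk0*nchunk1) {
    // ... compute the dst block that corresponds to current_chunk ...
    current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1,
                                              memory_order_relaxed);
}
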
@@ -12474,7 +12971,7 @@ UseGgmlGemm2:;
12474
12971
  break;
12475
12972
  }
12476
12973
 
12477
- current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
12974
+ current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
12478
12975
  }
12479
12976
  }
12480
12977
 
@@ -12569,7 +13066,7 @@ static void lm_ggml_compute_forward_mul_mat_id(
12569
13066
  }
12570
13067
  }
12571
13068
 
12572
- lm_ggml_barrier(params->shared);
13069
+ lm_ggml_barrier(params->threadpool);
12573
13070
 
12574
13071
  // compute each matrix multiplication in sequence
12575
13072
  for (int cur_a = 0; cur_a < n_as; ++cur_a) {
@@ -12723,7 +13220,7 @@ static void lm_ggml_compute_forward_out_prod_f32(
12723
13220
  if (ith == 0) {
12724
13221
  lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
12725
13222
  }
12726
- lm_ggml_barrier(params->shared);
13223
+ lm_ggml_barrier(params->threadpool);
12727
13224
 
12728
13225
  // dst[:,:,:,:] = 0
12729
13226
  // for i2,i3:
@@ -12841,7 +13338,7 @@ static void lm_ggml_compute_forward_out_prod_q_f32(
12841
13338
  if (ith == 0) {
12842
13339
  lm_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
12843
13340
  }
12844
- lm_ggml_barrier(params->shared);
13341
+ lm_ggml_barrier(params->threadpool);
12845
13342
 
12846
13343
  // parallelize by last three dimensions
12847
13344
 
@@ -12907,6 +13404,8 @@ static void lm_ggml_compute_forward_out_prod(
12907
13404
  case LM_GGML_TYPE_Q4_K:
12908
13405
  case LM_GGML_TYPE_Q5_K:
12909
13406
  case LM_GGML_TYPE_Q6_K:
13407
+ case LM_GGML_TYPE_TQ1_0:
13408
+ case LM_GGML_TYPE_TQ2_0:
12910
13409
  case LM_GGML_TYPE_IQ2_XXS:
12911
13410
  case LM_GGML_TYPE_IQ2_XS:
12912
13411
  case LM_GGML_TYPE_IQ3_XXS:
@@ -13027,7 +13526,7 @@ static void lm_ggml_compute_forward_set_f32(
13027
13526
  ((char *) src0->data),
13028
13527
  lm_ggml_nbytes(dst));
13029
13528
  }
13030
- lm_ggml_barrier(params->shared);
13529
+ lm_ggml_barrier(params->threadpool);
13031
13530
  }
13032
13531
 
13033
13532
  const int ith = params->ith;
@@ -13095,6 +13594,8 @@ static void lm_ggml_compute_forward_set(
13095
13594
  case LM_GGML_TYPE_Q4_K:
13096
13595
  case LM_GGML_TYPE_Q5_K:
13097
13596
  case LM_GGML_TYPE_Q6_K:
13597
+ case LM_GGML_TYPE_TQ1_0:
13598
+ case LM_GGML_TYPE_TQ2_0:
13098
13599
  case LM_GGML_TYPE_IQ2_XXS:
13099
13600
  case LM_GGML_TYPE_IQ2_XS:
13100
13601
  case LM_GGML_TYPE_IQ3_XXS:
@@ -13208,7 +13709,7 @@ static void lm_ggml_compute_forward_get_rows_q(
13208
13709
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13209
13710
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13210
13711
 
13211
- assert(i01 >= 0 && i01 < ne01);
13712
+ LM_GGML_ASSERT(i01 >= 0 && i01 < ne01);
13212
13713
 
13213
13714
  dequantize_row_q(
13214
13715
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@@ -13249,7 +13750,7 @@ static void lm_ggml_compute_forward_get_rows_f16(
13249
13750
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13250
13751
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13251
13752
 
13252
- assert(i01 >= 0 && i01 < ne01);
13753
+ LM_GGML_ASSERT(i01 >= 0 && i01 < ne01);
13253
13754
 
13254
13755
  lm_ggml_fp16_to_fp32_row(
13255
13756
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@@ -13290,7 +13791,7 @@ static void lm_ggml_compute_forward_get_rows_bf16(
13290
13791
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13291
13792
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13292
13793
 
13293
- assert(i01 >= 0 && i01 < ne01);
13794
+ LM_GGML_ASSERT(i01 >= 0 && i01 < ne01);
13294
13795
 
13295
13796
  lm_ggml_bf16_to_fp32_row(
13296
13797
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@@ -13331,7 +13832,7 @@ static void lm_ggml_compute_forward_get_rows_f32(
13331
13832
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13332
13833
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13333
13834
 
13334
- assert(i01 >= 0 && i01 < ne01);
13835
+ LM_GGML_ASSERT(i01 >= 0 && i01 < ne01);
13335
13836
 
13336
13837
  lm_ggml_vec_cpy_f32(nc,
13337
13838
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
@@ -13357,6 +13858,8 @@ static void lm_ggml_compute_forward_get_rows(
13357
13858
  case LM_GGML_TYPE_Q4_K:
13358
13859
  case LM_GGML_TYPE_Q5_K:
13359
13860
  case LM_GGML_TYPE_Q6_K:
13861
+ case LM_GGML_TYPE_TQ1_0:
13862
+ case LM_GGML_TYPE_TQ2_0:
13360
13863
  case LM_GGML_TYPE_IQ2_XXS:
13361
13864
  case LM_GGML_TYPE_IQ2_XS:
13362
13865
  case LM_GGML_TYPE_IQ3_XXS:
@@ -13606,7 +14109,7 @@ static void lm_ggml_compute_forward_diag_mask_f32(
13606
14109
  ((char *) src0->data),
13607
14110
  lm_ggml_nbytes(dst));
13608
14111
  }
13609
- lm_ggml_barrier(params->shared);
14112
+ lm_ggml_barrier(params->threadpool);
13610
14113
  }
13611
14114
 
13612
14115
  // TODO: handle transposed/permuted matrices
@@ -13946,6 +14449,8 @@ static void lm_ggml_compute_forward_clamp(
13946
14449
  case LM_GGML_TYPE_Q4_K:
13947
14450
  case LM_GGML_TYPE_Q5_K:
13948
14451
  case LM_GGML_TYPE_Q6_K:
14452
+ case LM_GGML_TYPE_TQ1_0:
14453
+ case LM_GGML_TYPE_TQ2_0:
13949
14454
  case LM_GGML_TYPE_IQ2_XXS:
13950
14455
  case LM_GGML_TYPE_IQ2_XS:
13951
14456
  case LM_GGML_TYPE_IQ3_XXS:
@@ -14382,7 +14887,7 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32(
14382
14887
  // need to zero dst since we are accumulating into it
14383
14888
  memset(dst->data, 0, lm_ggml_nbytes(dst));
14384
14889
  }
14385
- lm_ggml_barrier(params->shared);
14890
+ lm_ggml_barrier(params->threadpool);
14386
14891
 
14387
14892
  const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
14388
14893
 
@@ -14470,7 +14975,7 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32(
14470
14975
  // need to zero dst since we are accumulating into it
14471
14976
  memset(dst->data, 0, lm_ggml_nbytes(dst));
14472
14977
  }
14473
- lm_ggml_barrier(params->shared);
14978
+ lm_ggml_barrier(params->threadpool);
14474
14979
 
14475
14980
  const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
14476
14981
 
@@ -14525,6 +15030,7 @@ static void lm_ggml_compute_forward_conv_transpose_1d(
14525
15030
  }
14526
15031
  }
14527
15032
 
15033
+ // lm_ggml_compute_forward_im2col_f32
14528
15034
  // src0: kernel [OC, IC, KH, KW]
14529
15035
  // src1: image [N, IC, IH, IW]
14530
15036
  // dst: result [N, OH, OW, IC*KH*KW]
@@ -14535,7 +15041,6 @@ static void lm_ggml_compute_forward_im2col_f32(
14535
15041
  const struct lm_ggml_tensor * src0 = dst->src[0];
14536
15042
  const struct lm_ggml_tensor * src1 = dst->src[1];
14537
15043
 
14538
- LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16);
14539
15044
  LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32);
14540
15045
  LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32);
14541
15046
 
@@ -14566,7 +15071,6 @@ static void lm_ggml_compute_forward_im2col_f32(
14566
15071
  int ofs0 = is_2D ? nb13 : nb12;
14567
15072
  int ofs1 = is_2D ? nb12 : nb11;
14568
15073
 
14569
- LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t));
14570
15074
  LM_GGML_ASSERT(nb10 == sizeof(float));
14571
15075
 
14572
15076
  // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -14602,6 +15106,7 @@ static void lm_ggml_compute_forward_im2col_f32(
14602
15106
  }
14603
15107
 
14604
15108
 
15109
+ // lm_ggml_compute_forward_im2col_f16
14605
15110
  // src0: kernel [OC, IC, KH, KW]
14606
15111
  // src1: image [N, IC, IH, IW]
14607
15112
  // dst: result [N, OH, OW, IC*KH*KW]
@@ -14697,6 +15202,99 @@ static void lm_ggml_compute_forward_im2col(
14697
15202
  }
14698
15203
  }
14699
15204
 
15205
+ // lm_ggml_compute_forward_im2col_back_f32
15206
+
15207
+ static void lm_ggml_compute_forward_im2col_back_f32(
15208
+ const struct lm_ggml_compute_params * params,
15209
+ struct lm_ggml_tensor * dst) {
15210
+
15211
+ const struct lm_ggml_tensor * src0 = dst->src[0];
15212
+ const struct lm_ggml_tensor * src1 = dst->src[1];
15213
+
15214
+ LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32);
15215
+ LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32);
15216
+
15217
+ LM_GGML_TENSOR_BINARY_OP_LOCALS;
15218
+
15219
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
15220
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
15221
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
15222
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
15223
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
15224
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
15225
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
15226
+
15227
+ const int ith = params->ith;
15228
+ const int nth = params->nth;
15229
+
15230
+ const int64_t N = is_2D ? ne3 : ne2;
15231
+ const int64_t IC = is_2D ? ne2 : ne1;
15232
+ const int64_t IH = is_2D ? ne1 : 1;
15233
+ const int64_t IW = ne0;
15234
+
15235
+ const int64_t KH = is_2D ? ne01 : 1;
15236
+ const int64_t KW = ne00;
15237
+
15238
+ const int64_t OH = is_2D ? ne12 : 1;
15239
+ const int64_t OW = ne11;
15240
+
15241
+ int ofs0 = is_2D ? nb3 : nb2;
15242
+ int ofs1 = is_2D ? nb2 : nb1;
15243
+
15244
+ LM_GGML_ASSERT(nb0 == sizeof(float));
15245
+
15246
+ // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
15247
+ {
15248
+ float * const wdata = (float *) dst->data;
15249
+
15250
+ for (int64_t in = 0; in < N; in++) {
15251
+ for (int64_t iic = ith; iic < IC; iic += nth) {
15252
+ for (int64_t iih = 0; iih < IH; iih++) {
15253
+ for (int64_t iiw = 0; iiw < IW; iiw++) {
15254
+
15255
+ // micro kernel
15256
+ float grad = 0.0f;
15257
+ for (int64_t ikh = 0; ikh < KH; ikh++) {
15258
+ for (int64_t ikw = 0; ikw < KW; ikw++) {
15259
+ // For s0 > 1 some values were skipped over in the forward pass.
15260
+ // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well.
15261
+ const int64_t tmpw = (iiw + p0 - ikw*d0);
15262
+ if (tmpw % s0 != 0) {
15263
+ continue;
15264
+ }
15265
+ const int64_t iow = tmpw / s0;
15266
+
15267
+ // Equivalent logic as above except for s1.
15268
+ int64_t ioh;
15269
+ if (is_2D) {
15270
+ const int64_t tmph = iih + p1 - ikh*d1;
15271
+
15272
+ if (tmph % s1 != 0) {
15273
+ continue;
15274
+ }
15275
+
15276
+ ioh = tmph / s1;
15277
+ } else {
15278
+ ioh = 0;
15279
+ }
15280
+
15281
+ if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) {
15282
+ continue;
15283
+ }
15284
+
15285
+ const float * const src_data = (const float *) src1->data
15286
+ + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
15287
+ grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
15288
+ }
15289
+ }
15290
+ float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
15291
+ dst_data[iih*IW + iiw] = grad;
15292
+ }
15293
+ }
15294
+ }
15295
+ }
15296
+ }
15297
+ }
14700
15298
 
14701
15299
  // lm_ggml_compute_forward_conv_transpose_2d
14702
15300
 
@@ -14757,7 +15355,7 @@ static void lm_ggml_compute_forward_conv_transpose_2d(
14757
15355
 
14758
15356
  memset(dst->data, 0, lm_ggml_nbytes(dst));
14759
15357
  }
14760
- lm_ggml_barrier(params->shared);
15358
+ lm_ggml_barrier(params->threadpool);
14761
15359
 
14762
15360
  const int32_t stride = lm_ggml_get_op_params_i32(dst, 0);
14763
15361
 
@@ -14939,45 +15537,167 @@ static void lm_ggml_compute_forward_pool_2d(
14939
15537
  }
14940
15538
  }
14941
15539
 
14942
- // lm_ggml_compute_forward_upscale
15540
+ // lm_ggml_compute_forward_pool_2d_back
14943
15541
 
14944
- static void lm_ggml_compute_forward_upscale_f32(
14945
- const struct lm_ggml_compute_params * params,
14946
- struct lm_ggml_tensor * dst) {
15542
+ static void lm_ggml_compute_forward_pool_2d_back(
15543
+ const struct lm_ggml_compute_params * params,
15544
+ struct lm_ggml_tensor * dst) {
14947
15545
 
14948
- const struct lm_ggml_tensor * src0 = dst->src[0];
15546
+ const struct lm_ggml_tensor * src = dst->src[0];
15547
+ const struct lm_ggml_tensor * dstf = dst->src[1]; // forward tensor of dst
14949
15548
 
14950
- LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32);
15549
+ assert(dst->type == LM_GGML_TYPE_F32 || dst->type == LM_GGML_TYPE_F16);
14951
15550
 
14952
- const int ith = params->ith;
14953
- const int nth = params->nth;
15551
+ if (params->ith != 0) {
15552
+ return;
15553
+ }
14954
15554
 
14955
- LM_GGML_TENSOR_UNARY_OP_LOCALS
15555
+ const int32_t * opts = (const int32_t *)dst->op_params;
15556
+ enum lm_ggml_op_pool op = opts[0];
15557
+ const int k0 = opts[1];
15558
+ const int k1 = opts[2];
15559
+ const int s0 = opts[3];
15560
+ const int s1 = opts[4];
15561
+ const int p0 = opts[5];
15562
+ const int p1 = opts[6];
14956
15563
 
14957
- const float sf0 = (float)ne0/src0->ne[0];
14958
- const float sf1 = (float)ne1/src0->ne[1];
14959
- const float sf2 = (float)ne2/src0->ne[2];
14960
- const float sf3 = (float)ne3/src0->ne[3];
15564
+ char * cdata = (char *) dst->data;
15565
+ const char * cdataf = (const char *) dstf->data;
15566
+ const char * const data_end = cdata + lm_ggml_nbytes(dst);
14961
15567
 
14962
- // TODO: optimize
15568
+ LM_GGML_ASSERT(params->ith == 0);
15569
+ memset(cdata, 0, lm_ggml_nbytes(dst));
14963
15570
 
14964
- for (int64_t i3 = 0; i3 < ne3; i3++) {
14965
- const int64_t i03 = i3 / sf3;
14966
- for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
14967
- const int64_t i02 = i2 / sf2;
14968
- for (int64_t i1 = 0; i1 < ne1; i1++) {
14969
- const int64_t i01 = i1 / sf1;
14970
- for (int64_t i0 = 0; i0 < ne0; i0++) {
14971
- const int64_t i00 = i0 / sf0;
15571
+ const int64_t px = src->ne[0];
15572
+ const int64_t py = src->ne[1];
15573
+ const int64_t pa = px * py;
14972
15574
 
14973
- const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
14974
- float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
15575
+ const float * splane = (const float *) src->data;
14975
15576
 
14976
- *y = *x;
14977
- }
14978
- }
14979
- }
14980
- }
15577
+ const int ka = k0 * k1;
15578
+ const int offset0 = -p0;
15579
+ const int offset1 = -p1;
15580
+
15581
+ while (cdata < data_end) {
15582
+ for (int oy = 0; oy < py; ++oy) {
15583
+ const float * const srow = splane + oy * px;
15584
+ for (int ox = 0; ox < px; ++ox) {
15585
+ const float grad0 = srow[ox];
15586
+
15587
+ const int ix = offset0 + ox * s0;
15588
+ const int iy = offset1 + oy * s1;
15589
+
15590
+ if (op == LM_GGML_OP_POOL_MAX) {
15591
+ float maxval = -FLT_MAX;
15592
+ int kxmax = -1;
15593
+ int kymax = -1;
15594
+
15595
+ for (int ky = 0; ky < k1; ++ky) {
15596
+ if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
15597
+ continue;
15598
+ }
15599
+ const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky));
15600
+ for (int kx = 0; kx < k0; ++kx) {
15601
+ int j = ix + kx;
15602
+ if (j < 0 || j >= dst->ne[0]) {
15603
+ continue;
15604
+ }
15605
+
15606
+ const float val = dst->type == LM_GGML_TYPE_F32 ?
15607
+ ((const float *) drowf)[j] : LM_GGML_FP16_TO_FP32(((const lm_ggml_fp16_t *) drowf)[j]);
15608
+ if (val <= maxval) {
15609
+ continue;
15610
+ }
15611
+
15612
+ maxval = val;
15613
+ kxmax = kx;
15614
+ kymax = ky;
15615
+ }
15616
+ }
15617
+
15618
+ if (kxmax == -1 || kymax == -1) {
15619
+ continue;
15620
+ }
15621
+
15622
+ void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax));
15623
+ const int j = ix + kxmax;
15624
+ if (dst->type == LM_GGML_TYPE_F32) {
15625
+ ((float *) drow)[j] += grad0;
15626
+ } else {
15627
+ ((lm_ggml_fp16_t *) drow)[j] = LM_GGML_FP32_TO_FP16(grad0 + LM_GGML_FP16_TO_FP32(((const lm_ggml_fp16_t *) drow)[j]));
15628
+ }
15629
+ } else if (op == LM_GGML_OP_POOL_AVG) {
15630
+ const float grad = grad0 / ka;
15631
+
15632
+ for (int ky = 0; ky < k1; ++ky) {
15633
+ if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
15634
+ continue;
15635
+ }
15636
+ void * drow = (void *)(cdata + dst->nb[1] * (iy + ky));
15637
+ for (int kx = 0; kx < k0; ++kx) {
15638
+ int j = ix + kx;
15639
+ if (j < 0 || j >= dst->ne[0]) {
15640
+ continue;
15641
+ }
15642
+
15643
+ if (dst->type == LM_GGML_TYPE_F32) {
15644
+ ((float *) drow)[j] += grad;
15645
+ } else {
15646
+ ((lm_ggml_fp16_t *) drow)[j] += LM_GGML_FP32_TO_FP16(grad);
15647
+ }
15648
+ }
15649
+ }
15650
+ } else {
15651
+ LM_GGML_ASSERT(false);
15652
+ }
15653
+ }
15654
+ }
15655
+
15656
+ cdata += dst->nb[2];
15657
+ cdataf += dst->nb[2];
15658
+ splane += pa;
15659
+ }
15660
+ }
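
For reference, the two branches above route the incoming gradient differently: max pooling sends the whole gradient to the cell that produced the window maximum, while average pooling spreads it evenly over the k0*k1 window. A simplified single-window F32 sketch under those assumptions (standalone helper, ignoring padding and the F16 path handled above):

    #include <stdbool.h>

    // win:  the k values the forward pass pooled over
    // dwin: the gradient accumulator for the same positions
    static void pool_window_back_f32(float grad0, const float * win, float * dwin,
                                     int k, bool is_max) {
        if (is_max) {
            int imax = 0;
            for (int i = 1; i < k; ++i) {
                if (win[i] > win[imax]) {
                    imax = i;           // first maximum wins, matching the "val <= maxval" check above
                }
            }
            dwin[imax] += grad0;        // all of the gradient goes to the argmax
        } else {
            for (int i = 0; i < k; ++i) {
                dwin[i] += grad0 / k;   // evenly distributed for average pooling
            }
        }
    }
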
15661
+
15662
+ // lm_ggml_compute_forward_upscale
15663
+
15664
+ static void lm_ggml_compute_forward_upscale_f32(
15665
+ const struct lm_ggml_compute_params * params,
15666
+ struct lm_ggml_tensor * dst) {
15667
+
15668
+ const struct lm_ggml_tensor * src0 = dst->src[0];
15669
+
15670
+ LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32);
15671
+
15672
+ const int ith = params->ith;
15673
+ const int nth = params->nth;
15674
+
15675
+ LM_GGML_TENSOR_UNARY_OP_LOCALS
15676
+
15677
+ const float sf0 = (float)ne0/src0->ne[0];
15678
+ const float sf1 = (float)ne1/src0->ne[1];
15679
+ const float sf2 = (float)ne2/src0->ne[2];
15680
+ const float sf3 = (float)ne3/src0->ne[3];
15681
+
15682
+ // TODO: optimize
15683
+
15684
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
15685
+ const int64_t i03 = i3 / sf3;
15686
+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
15687
+ const int64_t i02 = i2 / sf2;
15688
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
15689
+ const int64_t i01 = i1 / sf1;
15690
+ for (int64_t i0 = 0; i0 < ne0; i0++) {
15691
+ const int64_t i00 = i0 / sf0;
15692
+
15693
+ const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
15694
+ float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
15695
+
15696
+ *y = *x;
15697
+ }
15698
+ }
15699
+ }
15700
+ }
14981
15701
  }
14982
15702
 
14983
15703
  static void lm_ggml_compute_forward_upscale(
@@ -15503,7 +16223,7 @@ static void lm_ggml_compute_forward_flash_attn_back_f32(
15503
16223
  if (ith == 0) {
15504
16224
  memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
15505
16225
  }
15506
- lm_ggml_barrier(params->shared);
16226
+ lm_ggml_barrier(params->threadpool);
15507
16227
 
15508
16228
  const int64_t elem_q = lm_ggml_nelements(q);
15509
16229
  const int64_t elem_k = lm_ggml_nelements(k);
@@ -16125,6 +16845,10 @@ static void lm_ggml_compute_forward_unary(
16125
16845
  {
16126
16846
  lm_ggml_compute_forward_hardsigmoid(params, dst);
16127
16847
  } break;
16848
+ case LM_GGML_UNARY_OP_EXP:
16849
+ {
16850
+ lm_ggml_compute_forward_exp(params, dst);
16851
+ } break;
16128
16852
  default:
16129
16853
  {
16130
16854
  LM_GGML_ABORT("fatal error");
@@ -16194,7 +16918,7 @@ static void lm_ggml_compute_forward_add_rel_pos_f32(
16194
16918
  if (params->ith == 0) {
16195
16919
  memcpy((char *) dst->data, (char *) src0->data, lm_ggml_nbytes(dst));
16196
16920
  }
16197
- lm_ggml_barrier(params->shared);
16921
+ lm_ggml_barrier(params->threadpool);
16198
16922
  }
16199
16923
  // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
16200
16924
 
@@ -16260,6 +16984,96 @@ static void lm_ggml_compute_forward_add_rel_pos(
16260
16984
  }
16261
16985
  }
16262
16986
 
16987
+ // lm_ggml_compute_forward_rwkv_wkv
16988
+
16989
+ static void lm_ggml_compute_forward_rwkv_wkv_f32(
16990
+ const struct lm_ggml_compute_params * params,
16991
+ struct lm_ggml_tensor * dst) {
16992
+ const size_t T = dst->src[1]->ne[3];
16993
+ const size_t C = dst->ne[0];
16994
+ const size_t H = dst->src[1]->ne[2];
16995
+ const size_t n_seqs = dst->src[5]->ne[1];
16996
+
16997
+ float * dst_data = (float *) dst->data;
16998
+ float * state = ((float *) dst->data) + C * T;
16999
+
17000
+ if (params->ith != 0) {
17001
+ return;
17002
+ }
17003
+
17004
+ memset(dst_data, 0, T * C * sizeof(float));
17005
+
17006
+ float * k = (float *) dst->src[0]->data;
17007
+ float * v = (float *) dst->src[1]->data;
17008
+ float * r = (float *) dst->src[2]->data;
17009
+ float * time_faaaa = (float *) dst->src[3]->data;
17010
+ float * time_decay = (float *) dst->src[4]->data;
17011
+
17012
+ size_t t_stride = H * (C / H);
17013
+
17014
+ size_t h_stride = C / H;
17015
+ size_t h_stride_2d = (C / H) * (C / H);
17016
+
17017
+ // basically fused operations:
17018
+ // dst = r @ (time_faaaa * (k @ v) + state),
17019
+ // state = time_decay * state + (k @ v),
17020
+ // recursive through each token
17021
+ for (size_t t = 0; t < T; t++) {
17022
+ size_t t_offset = t * t_stride;
17023
+ size_t state_offset = (C / H) * C * (t / (T / n_seqs));
17024
+ float * state_cur = state + state_offset;
17025
+ float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset;
17026
+
17027
+ for (size_t h = 0; h < H; h++) {
17028
+ size_t h_offset = h * h_stride;
17029
+ size_t t_h_offset = t_offset + h_offset;
17030
+ size_t h_2d_offset = h * h_stride_2d;
17031
+
17032
+ for (size_t i = 0; i < C / H; i++) {
17033
+ size_t t_h_i_offset = t_h_offset + i;
17034
+ size_t h_i_offset = h_offset + i;
17035
+ size_t h_2d_i_offset = h_2d_offset + i * h_stride;
17036
+
17037
+ float k_val = k[t_h_i_offset];
17038
+ float r_val = r[t_h_i_offset];
17039
+ float time_faaaa_val = time_faaaa[h_i_offset];
17040
+ // RWKV v6: different time_decay for each token.
17041
+ float time_decay_val = time_decay[t_h_i_offset];
17042
+
17043
+ for (size_t j = 0; j < C / H; j ++) {
17044
+ size_t t_h_j_offset = t_h_offset + j;
17045
+ size_t h_2d_i_j_offset = h_2d_i_offset + j;
17046
+
17047
+ float v_val = v[t_h_j_offset];
17048
+ float kv_val = v_val * k_val;
17049
+ float prev_state_val = state_prev[h_2d_i_j_offset];
17050
+ float temp_val = kv_val * time_faaaa_val + prev_state_val;
17051
+ dst_data[t_h_j_offset] += temp_val * r_val;
17052
+ state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val;
17053
+ }
17054
+ }
17055
+ }
17056
+ }
17057
+ }
17058
+
17059
+ static void lm_ggml_compute_forward_rwkv_wkv(
17060
+ const struct lm_ggml_compute_params * params,
17061
+ struct lm_ggml_tensor * dst) {
17062
+
17063
+ const struct lm_ggml_tensor * src0 = dst->src[0];
17064
+
17065
+ switch (src0->type) {
17066
+ case LM_GGML_TYPE_F32:
17067
+ {
17068
+ lm_ggml_compute_forward_rwkv_wkv_f32(params, dst);
17069
+ } break;
17070
+ default:
17071
+ {
17072
+ LM_GGML_ABORT("fatal error");
17073
+ }
17074
+ }
17075
+ }
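
Reading the fused loops above per head, with u = time_faaaa, w_t = time_decay for token t, and S the (C/H) x (C/H) state matrix, the recurrence they implement can be written as (my reading of the code, not a formula stated by the package):

    \[
    \mathrm{out}_t = r_t^{\top}\left(\operatorname{diag}(u)\,k_t v_t^{\top} + S_{t-1}\right),
    \qquad
    S_t = \operatorname{diag}(w_t)\,S_{t-1} + k_t v_t^{\top}
    \]

with the state carried across the T/n_seqs tokens of each sequence; only at the first token of a sequence is S_{t-1} taken from dst->src[5], as the state_prev selection above shows.
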
17076
+
16263
17077
  // lm_ggml_compute_forward_map_unary
16264
17078
 
16265
17079
  static void lm_ggml_compute_forward_map_unary_f32(
@@ -16479,9 +17293,7 @@ static void lm_ggml_compute_forward_cross_entropy_loss_f32(
16479
17293
  if (ith == 0) {
16480
17294
  memset(sums, 0, sizeof(float) * (nth + nth * nc));
16481
17295
  }
16482
- lm_ggml_barrier(params->shared);
16483
-
16484
- const double eps = 1e-9;
17296
+ lm_ggml_barrier(params->threadpool);
16485
17297
 
16486
17298
  // rows per thread
16487
17299
  const int dr = (nr + nth - 1)/nth;
@@ -16503,20 +17315,15 @@ static void lm_ggml_compute_forward_cross_entropy_loss_f32(
16503
17315
  }
16504
17316
  #endif
16505
17317
 
16506
- // soft_max
16507
17318
  float max = -INFINITY;
16508
17319
  lm_ggml_vec_max_f32(nc, &max, s0);
16509
- lm_ggml_float sum = lm_ggml_vec_soft_max_f32(nc, st, s0, max);
16510
- assert(sum > 0.0);
16511
- sum = (1.0 - eps) / sum;
17320
+ lm_ggml_float sum = lm_ggml_vec_log_soft_max_f32(nc, st, s0, max);
17321
+ assert(sum >= 0.0);
16512
17322
 
16513
- // avoid log(0) by rescaling from [0..1] to [eps..1]
16514
- lm_ggml_vec_scale_f32(nc, st, sum);
16515
- lm_ggml_vec_add1_f32(nc, st, st, eps);
16516
- lm_ggml_vec_log_f32(nc, st, st);
17323
+ lm_ggml_vec_add1_f32(nc, st, st, -sum);
16517
17324
  lm_ggml_vec_mul_f32(nc, st, st, s1);
16518
17325
 
16519
- float st_sum = 0;
17326
+ float st_sum = 0.0f;
16520
17327
  lm_ggml_vec_sum_f32(nc, &st_sum, st);
16521
17328
  sums[ith] += st_sum;
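
The rewritten loss path above computes a numerically stable log-softmax directly instead of the earlier eps-rescaled softmax followed by a log. Using the standard identity (not specific to this package)

    \[
    \log\operatorname{softmax}(x)_i = x_i - \max_j x_j - \log\sum_j e^{\,x_j - \max_j x_j}
    \]

lm_ggml_vec_log_soft_max_f32 presumably returns the log-sum-exp term, which is >= 0 because the max element contributes e^0 = 1; that explains the relaxed assert(sum >= 0.0), and the following vec_add1 with -sum completes the subtraction, removing the need for the old eps clamp.
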
16522
17329
 
@@ -16527,7 +17334,7 @@ static void lm_ggml_compute_forward_cross_entropy_loss_f32(
16527
17334
  }
16528
17335
  #endif
16529
17336
  }
16530
- lm_ggml_barrier(params->shared);
17337
+ lm_ggml_barrier(params->threadpool);
16531
17338
 
16532
17339
  if (ith == 0) {
16533
17340
  float * dp = (float *) dst->data;
@@ -16573,8 +17380,6 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32(
16573
17380
  const int64_t ith = params->ith;
16574
17381
  const int64_t nth = params->nth;
16575
17382
 
16576
- const double eps = 1e-9;
16577
-
16578
17383
  // TODO: handle transposed/permuted matrices
16579
17384
  const int64_t nc = src0->ne[0];
16580
17385
  const int64_t nr = lm_ggml_nrows(src0);
@@ -16606,11 +17411,9 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32(
16606
17411
  lm_ggml_vec_max_f32(nc, &max, s0);
16607
17412
  lm_ggml_float sum = lm_ggml_vec_soft_max_f32(nc, ds0, s0, max);
16608
17413
  assert(sum > 0.0);
16609
- sum = (1.0 - eps) / sum;
17414
+ lm_ggml_vec_scale_f32(nc, ds0, 1.0/sum);
16610
17415
 
16611
17416
  // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
16612
- lm_ggml_vec_scale_f32(nc, ds0, sum);
16613
- lm_ggml_vec_add1_f32(nc, ds0, ds0, eps);
16614
17417
  lm_ggml_vec_sub_f32(nc, ds0, ds0, s1);
16615
17418
  lm_ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
16616
17419
 
@@ -16691,6 +17494,14 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
16691
17494
  {
16692
17495
  lm_ggml_compute_forward_log(params, tensor);
16693
17496
  } break;
17497
+ case LM_GGML_OP_SIN:
17498
+ {
17499
+ lm_ggml_compute_forward_sin(params, tensor);
17500
+ } break;
17501
+ case LM_GGML_OP_COS:
17502
+ {
17503
+ lm_ggml_compute_forward_cos(params, tensor);
17504
+ } break;
16694
17505
  case LM_GGML_OP_SUM:
16695
17506
  {
16696
17507
  lm_ggml_compute_forward_sum(params, tensor);
@@ -16831,6 +17642,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
16831
17642
  {
16832
17643
  lm_ggml_compute_forward_im2col(params, tensor);
16833
17644
  } break;
17645
+ case LM_GGML_OP_IM2COL_BACK:
17646
+ {
17647
+ lm_ggml_compute_forward_im2col_back_f32(params, tensor);
17648
+ } break;
16834
17649
  case LM_GGML_OP_CONV_TRANSPOSE_2D:
16835
17650
  {
16836
17651
  lm_ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -16843,6 +17658,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
16843
17658
  {
16844
17659
  lm_ggml_compute_forward_pool_2d(params, tensor);
16845
17660
  } break;
17661
+ case LM_GGML_OP_POOL_2D_BACK:
17662
+ {
17663
+ lm_ggml_compute_forward_pool_2d_back(params, tensor);
17664
+ } break;
16846
17665
  case LM_GGML_OP_UPSCALE:
16847
17666
  {
16848
17667
  lm_ggml_compute_forward_upscale(params, tensor);
@@ -16906,6 +17725,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
16906
17725
  {
16907
17726
  lm_ggml_compute_forward_add_rel_pos(params, tensor);
16908
17727
  } break;
17728
+ case LM_GGML_OP_RWKV_WKV:
17729
+ {
17730
+ lm_ggml_compute_forward_rwkv_wkv(params, tensor);
17731
+ } break;
16909
17732
  case LM_GGML_OP_MAP_UNARY:
16910
17733
  {
16911
17734
  lm_ggml_unary_op_f32_t fun;
@@ -17211,7 +18034,11 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17211
18034
  src0->grad = lm_ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
17212
18035
  }
17213
18036
  if (src1->grad) {
17214
- src1->grad = lm_ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table);
18037
+ if (lm_ggml_are_same_shape(src0, src1)) {
18038
+ src1->grad = lm_ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table);
18039
+ } else {
18040
+ src1->grad = lm_ggml_add_or_set(ctx, src1->grad, lm_ggml_repeat_back(ctx, tensor->grad, src1), zero_table);
18041
+ }
17215
18042
  }
17216
18043
  } break;
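
The new branch above handles broadcast adds: when src1 was implicitly repeated to src0's shape in the forward pass, the gradient flowing back to src1 must be summed over the repeated positions, which is what lm_ggml_repeat_back provides. In short, if y = a + repeat(b), then

    \[
    \frac{\partial L}{\partial b} = \sum_{\text{repeats of } b} \frac{\partial L}{\partial y}
    \]
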
17217
18044
  case LM_GGML_OP_ADD1:
@@ -17337,6 +18164,30 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17337
18164
  zero_table);
17338
18165
  }
17339
18166
  } break;
18167
+ case LM_GGML_OP_SIN:
18168
+ {
18169
+ if (src0->grad) {
18170
+ src0->grad =
18171
+ lm_ggml_add_or_set(ctx,
18172
+ src0->grad,
18173
+ lm_ggml_mul(ctx,
18174
+ tensor->grad,
18175
+ lm_ggml_cos(ctx, src0)),
18176
+ zero_table);
18177
+ }
18178
+ } break;
18179
+ case LM_GGML_OP_COS:
18180
+ {
18181
+ if (src0->grad) {
18182
+ src0->grad =
18183
+ lm_ggml_sub_or_set(ctx,
18184
+ src0->grad,
18185
+ lm_ggml_mul(ctx,
18186
+ tensor->grad,
18187
+ lm_ggml_sin(ctx, src0)),
18188
+ zero_table);
18189
+ }
18190
+ } break;
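
The two new cases follow directly from the elementary derivatives

    \[
    \frac{d}{dx}\sin x = \cos x, \qquad \frac{d}{dx}\cos x = -\sin x
    \]

which is why the SIN case accumulates tensor->grad * cos(src0) with add_or_set while the COS case uses sub_or_set to account for the minus sign.
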
17340
18191
  case LM_GGML_OP_SUM:
17341
18192
  {
17342
18193
  if (src0->grad) {
@@ -17509,14 +18360,10 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17509
18360
  if (src0->grad || src1->grad) {
17510
18361
  LM_GGML_ASSERT(src0->type == tensor->type);
17511
18362
  LM_GGML_ASSERT(tensor->grad->type == tensor->type);
17512
- LM_GGML_ASSERT(tensor->grad->type == src1->grad->type);
18363
+ LM_GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
17513
18364
 
17514
18365
  tensor_grad_view = lm_ggml_view_4d(ctx,
17515
- tensor->grad,
17516
- src1->grad->ne[0],
17517
- src1->grad->ne[1],
17518
- src1->grad->ne[2],
17519
- src1->grad->ne[3],
18366
+ tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
17520
18367
  nb1, nb2, nb3, offset);
17521
18368
  }
17522
18369
 
@@ -17585,9 +18432,9 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17585
18432
 
17586
18433
  memcpy(&offset, tensor->op_params, sizeof(offset));
17587
18434
 
17588
- size_t nb1 = tensor->nb[1];
17589
- size_t nb2 = tensor->nb[2];
17590
- size_t nb3 = tensor->nb[3];
18435
+ size_t nb1 = tensor->nb[1];
18436
+ size_t nb2 = tensor->nb[2];
18437
+ size_t nb3 = tensor->nb[3];
17591
18438
 
17592
18439
  if (src0->type != src0->grad->type) {
17593
18440
  // gradient is typically F32, but src0 could be other type
@@ -17784,6 +18631,23 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17784
18631
  LM_GGML_ABORT("fatal error"); // TODO: not implemented
17785
18632
  }
17786
18633
  case LM_GGML_OP_IM2COL:
18634
+ {
18635
+ if (src1->grad) {
18636
+ const int32_t s0 = lm_ggml_get_op_params_i32(tensor, 0);
18637
+ const int32_t s1 = lm_ggml_get_op_params_i32(tensor, 1);
18638
+ const int32_t p0 = lm_ggml_get_op_params_i32(tensor, 2);
18639
+ const int32_t p1 = lm_ggml_get_op_params_i32(tensor, 3);
18640
+ const int32_t d0 = lm_ggml_get_op_params_i32(tensor, 4);
18641
+ const int32_t d1 = lm_ggml_get_op_params_i32(tensor, 5);
18642
+ const bool is_2D = lm_ggml_get_op_params_i32(tensor, 6) == 1;
18643
+
18644
+ src1->grad = lm_ggml_add_or_set(ctx,
18645
+ src1->grad,
18646
+ lm_ggml_im2col_back(ctx, src0, tensor->grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D),
18647
+ zero_table);
18648
+ }
18649
+ } break;
18650
+ case LM_GGML_OP_IM2COL_BACK:
17787
18651
  {
17788
18652
  LM_GGML_ABORT("fatal error"); // TODO: not implemented
17789
18653
  }
@@ -17796,6 +18660,23 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17796
18660
  LM_GGML_ABORT("fatal error"); // TODO: not implemented
17797
18661
  }
17798
18662
  case LM_GGML_OP_POOL_2D:
18663
+ {
18664
+ if (src0->grad) {
18665
+ const enum lm_ggml_op_pool op = lm_ggml_get_op_params_i32(tensor, 0);
18666
+ const int32_t k0 = lm_ggml_get_op_params_i32(tensor, 1);
18667
+ const int32_t k1 = lm_ggml_get_op_params_i32(tensor, 2);
18668
+ const int32_t s0 = lm_ggml_get_op_params_i32(tensor, 3);
18669
+ const int32_t s1 = lm_ggml_get_op_params_i32(tensor, 4);
18670
+ const int32_t p0 = lm_ggml_get_op_params_i32(tensor, 5);
18671
+ const int32_t p1 = lm_ggml_get_op_params_i32(tensor, 6);
18672
+
18673
+ src0->grad = lm_ggml_add_or_set(ctx,
18674
+ src0->grad,
18675
+ lm_ggml_pool_2d_back(ctx, tensor->grad, src0, op, k0, k1, s0, s1, p0, p1),
18676
+ zero_table);
18677
+ }
18678
+ } break;
18679
+ case LM_GGML_OP_POOL_2D_BACK:
17799
18680
  {
17800
18681
  LM_GGML_ABORT("fatal error"); // TODO: not implemented
17801
18682
  }
@@ -17961,12 +18842,22 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm
17961
18842
  zero_table);
17962
18843
  }
17963
18844
  } break;
18845
+ case LM_GGML_UNARY_OP_EXP:
18846
+ {
18847
+ if (src0->grad) {
18848
+ src0->grad = lm_ggml_add_or_set(ctx,
18849
+ src0->grad,
18850
+ lm_ggml_mul(ctx, tensor, tensor->grad),
18851
+ zero_table);
18852
+ }
18853
+ } break;
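
Similarly, since d/dx e^x = e^x, the EXP backward can reuse the forward output: the gradient contribution is simply tensor * tensor->grad, with no recomputation of the exponential.
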
17964
18854
  default:
17965
18855
  LM_GGML_ABORT("fatal error");
17966
18856
  }
17967
18857
  } break;
17968
18858
  case LM_GGML_OP_GET_REL_POS:
17969
18859
  case LM_GGML_OP_ADD_REL_POS:
18860
+ case LM_GGML_OP_RWKV_WKV:
17970
18861
  case LM_GGML_OP_MAP_UNARY:
17971
18862
  case LM_GGML_OP_MAP_BINARY:
17972
18863
  case LM_GGML_OP_MAP_CUSTOM1_F32:
@@ -18085,6 +18976,7 @@ void lm_ggml_build_forward_expand(struct lm_ggml_cgraph * cgraph, struct lm_ggml
18085
18976
 
18086
18977
  void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep) {
18087
18978
  LM_GGML_ASSERT(gf->n_nodes > 0);
18979
+ LM_GGML_ASSERT(gf->grads);
18088
18980
 
18089
18981
  // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
18090
18982
  if (keep) {
@@ -18238,7 +19130,8 @@ void lm_ggml_graph_cpy(struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst)
18238
19130
  }
18239
19131
 
18240
19132
  for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
18241
- if (src->visited_hash_set.keys[i]) {
19133
+ // copy all hashset keys (tensors) that are in use
19134
+ if (lm_ggml_bitset_get(src->visited_hash_set.used, i)) {
18242
19135
  lm_ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
18243
19136
  }
18244
19137
  }
@@ -18268,65 +19161,6 @@ void lm_ggml_graph_clear(struct lm_ggml_cgraph * cgraph) {
18268
19161
  lm_ggml_hash_set_reset(&cgraph->visited_hash_set);
18269
19162
  }
18270
19163
 
18271
- //
18272
- // thread data
18273
- //
18274
- // synchronization is done via busy loops
18275
- // I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
18276
- //
18277
-
18278
- #ifdef __APPLE__
18279
-
18280
- //#include <os/lock.h>
18281
- //
18282
- //typedef os_unfair_lock lm_ggml_lock_t;
18283
- //
18284
- //#define lm_ggml_lock_init(x) UNUSED(x)
18285
- //#define lm_ggml_lock_destroy(x) UNUSED(x)
18286
- //#define lm_ggml_lock_lock os_unfair_lock_lock
18287
- //#define lm_ggml_lock_unlock os_unfair_lock_unlock
18288
- //
18289
- //#define LM_GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
18290
-
18291
- typedef int lm_ggml_lock_t;
18292
-
18293
- #define lm_ggml_lock_init(x) UNUSED(x)
18294
- #define lm_ggml_lock_destroy(x) UNUSED(x)
18295
- #define lm_ggml_lock_lock(x) UNUSED(x)
18296
- #define lm_ggml_lock_unlock(x) UNUSED(x)
18297
-
18298
- #define LM_GGML_LOCK_INITIALIZER 0
18299
-
18300
- #define lm_ggml_thread_create pthread_create
18301
- #define lm_ggml_thread_join pthread_join
18302
-
18303
- #else
18304
-
18305
- //typedef pthread_spinlock_t lm_ggml_lock_t;
18306
-
18307
- //#define lm_ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
18308
- //#define lm_ggml_lock_destroy pthread_spin_destroy
18309
- //#define lm_ggml_lock_lock pthread_spin_lock
18310
- //#define lm_ggml_lock_unlock pthread_spin_unlock
18311
-
18312
- typedef int lm_ggml_lock_t;
18313
-
18314
- #define lm_ggml_lock_init(x) UNUSED(x)
18315
- #define lm_ggml_lock_destroy(x) UNUSED(x)
18316
- #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
18317
- #define lm_ggml_lock_lock(x) _mm_pause()
18318
- #else
18319
- #define lm_ggml_lock_lock(x) UNUSED(x)
18320
- #endif
18321
- #define lm_ggml_lock_unlock(x) UNUSED(x)
18322
-
18323
- #define LM_GGML_LOCK_INITIALIZER 0
18324
-
18325
- #define lm_ggml_thread_create pthread_create
18326
- #define lm_ggml_thread_join pthread_join
18327
-
18328
- #endif
18329
-
18330
19164
  // Android's libc implementation "bionic" does not support setting affinity
18331
19165
  #if defined(__gnu_linux__)
18332
19166
  static void set_numa_thread_affinity(int thread_n) {
@@ -18424,6 +19258,8 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
18424
19258
  case LM_GGML_OP_SQR:
18425
19259
  case LM_GGML_OP_SQRT:
18426
19260
  case LM_GGML_OP_LOG:
19261
+ case LM_GGML_OP_SIN:
19262
+ case LM_GGML_OP_COS:
18427
19263
  case LM_GGML_OP_SUM:
18428
19264
  case LM_GGML_OP_SUM_ROWS:
18429
19265
  case LM_GGML_OP_MEAN:
@@ -18446,6 +19282,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
18446
19282
  case LM_GGML_UNARY_OP_SIGMOID:
18447
19283
  case LM_GGML_UNARY_OP_HARDSWISH:
18448
19284
  case LM_GGML_UNARY_OP_HARDSIGMOID:
19285
+ case LM_GGML_UNARY_OP_EXP:
18449
19286
  {
18450
19287
  n_tasks = 1;
18451
19288
  } break;
@@ -18510,6 +19347,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
18510
19347
  n_tasks = MIN(n_threads, lm_ggml_nrows(node->src[0]));
18511
19348
  } break;
18512
19349
  case LM_GGML_OP_IM2COL:
19350
+ case LM_GGML_OP_IM2COL_BACK:
18513
19351
  case LM_GGML_OP_CONV_TRANSPOSE_1D:
18514
19352
  case LM_GGML_OP_CONV_TRANSPOSE_2D:
18515
19353
  {
@@ -18517,6 +19355,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
18517
19355
  } break;
18518
19356
  case LM_GGML_OP_POOL_1D:
18519
19357
  case LM_GGML_OP_POOL_2D:
19358
+ case LM_GGML_OP_POOL_2D_BACK:
18520
19359
  {
18521
19360
  n_tasks = 1;
18522
19361
  } break;
@@ -18535,6 +19374,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
18535
19374
  case LM_GGML_OP_WIN_PART:
18536
19375
  case LM_GGML_OP_WIN_UNPART:
18537
19376
  case LM_GGML_OP_GET_REL_POS:
19377
+ case LM_GGML_OP_RWKV_WKV:
18538
19378
  case LM_GGML_OP_MAP_UNARY:
18539
19379
  case LM_GGML_OP_MAP_BINARY:
18540
19380
  case LM_GGML_OP_MAP_CUSTOM1_F32:
@@ -18603,9 +19443,281 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
18603
19443
  return n_tasks;
18604
19444
  }
18605
19445
 
18606
- struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, int n_threads) {
19446
+ static thread_ret_t lm_ggml_graph_compute_secondary_thread(void* data);
19447
+
19448
+ #if defined(_WIN32)
19449
+ #include "windows.h"
19450
+
19451
+ // TODO: support > 64 CPUs
19452
+ bool lm_ggml_thread_apply_affinity(bool * mask) {
19453
+ HANDLE h = GetCurrentThread();
19454
+ uint64_t bitmask = 0ULL;
19455
+
19456
+ assert(LM_GGML_MAX_N_THREADS >= 64);
19457
+
19458
+ for (int32_t i = 0; i < 8; i++) {
19459
+ int32_t idx = i * 8;
19460
+ uint8_t val = 0;
19461
+ val |= mask[idx + 0] << 0;
19462
+ val |= mask[idx + 1] << 1;
19463
+ val |= mask[idx + 2] << 2;
19464
+ val |= mask[idx + 3] << 3;
19465
+ val |= mask[idx + 4] << 4;
19466
+ val |= mask[idx + 5] << 5;
19467
+ val |= mask[idx + 6] << 6;
19468
+ val |= mask[idx + 7] << 7;
19469
+ bitmask |= (uint64_t)val << idx;
19470
+ }
19471
+
19472
+ for (int32_t i = 64; i < LM_GGML_MAX_N_THREADS; i++) {
19473
+ if (mask[i]) {
19474
+ fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
19475
+ break;
19476
+ }
19477
+ }
19478
+
19479
+ DWORD_PTR m = (DWORD_PTR)bitmask;
19480
+
19481
+ m = SetThreadAffinityMask(h, m);
19482
+
19483
+ return m != 0;
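
The byte-wise packing loop above folds the first 64 entries of the boolean mask into a single 64-bit affinity mask. A more direct equivalent, shown only to illustrate what the loop computes (not a proposed change to the code):

    #include <stdbool.h>
    #include <stdint.h>

    // Pack mask[0..63] into bit i of the result; produces the same value as the
    // two-level byte/bit loop used above.
    static uint64_t pack_affinity_mask(const bool * mask) {
        uint64_t bitmask = 0;
        for (int i = 0; i < 64; ++i) {
            if (mask[i]) {
                bitmask |= 1ULL << i;
            }
        }
        return bitmask;
    }
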
19484
+ }
19485
+
19486
+ static bool lm_ggml_thread_apply_priority(int32_t prio) {
19487
+ // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
19488
+ // This is up to the applications.
19489
+ DWORD p = THREAD_PRIORITY_NORMAL;
19490
+ switch (prio) {
19491
+ case LM_GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
19492
+ case LM_GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
19493
+ case LM_GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
19494
+ case LM_GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
19495
+ }
19496
+
19497
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
19498
+ // Keep inherited policy/priority
19499
+ return true;
19500
+ }
19501
+
19502
+ if (!SetThreadPriority(GetCurrentThread(), p)) {
19503
+ fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
19504
+ return false;
19505
+ }
19506
+
19507
+ return true;
19508
+ }
19509
+
19510
+ #elif defined(__APPLE__)
19511
+ #include <sys/types.h>
19512
+ #include <sys/resource.h>
19513
+
19514
+ static bool lm_ggml_thread_apply_affinity(const bool * mask) {
19515
+ // Not supported on Apple platforms
19516
+ UNUSED(mask);
19517
+ return true;
19518
+ }
19519
+
19520
+ static bool lm_ggml_thread_apply_priority(int32_t prio) {
19521
+ struct sched_param p;
19522
+ int32_t policy = SCHED_OTHER;
19523
+ switch (prio) {
19524
+ case LM_GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
19525
+ case LM_GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
19526
+ case LM_GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
19527
+ case LM_GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
19528
+ }
19529
+
19530
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
19531
+ // Keep inherited policy/priority
19532
+ return true;
19533
+ }
19534
+
19535
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
19536
+ if (err != 0) {
19537
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
19538
+ return false;
19539
+ }
19540
+
19541
+ return true;
19542
+ }
19543
+
19544
+ #elif defined(__gnu_linux__)
19545
+ // TODO: this may not work on BSD, to be verified
19546
+
19547
+ static bool lm_ggml_thread_apply_affinity(const bool * mask) {
19548
+ cpu_set_t cpuset;
19549
+ int err;
19550
+
19551
+ CPU_ZERO(&cpuset);
19552
+
19553
+ for (uint32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
19554
+ if (mask[i]) {
19555
+ LM_GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
19556
+ CPU_SET(i, &cpuset);
19557
+ }
19558
+ }
19559
+
19560
+ #ifdef __ANDROID__
19561
+ err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
19562
+ if (err < 0) {
19563
+ err = errno;
19564
+ }
19565
+ #else
19566
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
19567
+ #endif
19568
+ if (err != 0) {
19569
+ fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
19570
+ return false;
19571
+ }
19572
+
19573
+ return true;
19574
+ }
19575
+
19576
+ static bool lm_ggml_thread_apply_priority(int32_t prio) {
19577
+ struct sched_param p;
19578
+ int32_t policy = SCHED_OTHER;
19579
+ switch (prio) {
19580
+ case LM_GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
19581
+ case LM_GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
19582
+ case LM_GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
19583
+ case LM_GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
19584
+ }
19585
+
19586
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
19587
+ // Keep inherited policy/priority
19588
+ return true;
19589
+ }
19590
+
19591
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
19592
+ if (err != 0) {
19593
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
19594
+ return false;
19595
+ }
19596
+
19597
+ return true;
19598
+ }
19599
+
19600
+ #else // unsupported platforms
19601
+
19602
+ static bool lm_ggml_thread_apply_affinity(const bool * mask) {
19603
+ UNUSED(mask);
19604
+ return true;
19605
+ }
19606
+
19607
+ static bool lm_ggml_thread_apply_priority(int32_t prio) {
19608
+ UNUSED(prio);
19609
+ return true;
19610
+ }
19611
+
19612
+ #endif
19613
+
19614
+ static bool lm_ggml_thread_cpumask_is_valid(const bool * mask) {
19615
+ for (int i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
19616
+ if (mask[i]) { return true; }
19617
+ }
19618
+ return false;
19619
+ }
19620
+
19621
+ static void lm_ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
19622
+ if (!strict) {
19623
+ memcpy(local_mask, global_mask, LM_GGML_MAX_N_THREADS);
19624
+ return;
19625
+ } else {
19626
+ memset(local_mask, 0, LM_GGML_MAX_N_THREADS);
19627
+ int32_t base_idx = *iter;
19628
+ for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
19629
+ int32_t idx = base_idx + i;
19630
+ if (idx >= LM_GGML_MAX_N_THREADS) {
19631
+ // Just a cheaper modulo
19632
+ idx -= LM_GGML_MAX_N_THREADS;
19633
+ }
19634
+ if (global_mask[idx]) {
19635
+ local_mask[idx] = 1;
19636
+ *iter = idx + 1;
19637
+ return;
19638
+ }
19639
+ }
19640
+ }
19641
+ }
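
To make the strict vs. non-strict behavior above concrete, a small worked example with hypothetical mask values:

    // global_mask enables CPUs 2 and 5 only, strict_cpu = true, *iter starts at 0:
    //   1st call: scans 0,1,2 -> sets local_mask[2], *iter = 3
    //   2nd call: scans 3,4,5 -> sets local_mask[5], *iter = 6
    //   3rd call: scans 6.. , wraps modulo LM_GGML_MAX_N_THREADS, and selects CPU 2 again
    // With strict_cpu = false every worker simply receives a copy of the full global mask.
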
19642
+
19643
+ void lm_ggml_threadpool_free(struct lm_ggml_threadpool* threadpool) {
19644
+ if (!threadpool) return;
19645
+
19646
+ #ifndef LM_GGML_USE_OPENMP
19647
+ struct lm_ggml_compute_state* workers = threadpool->workers;
19648
+ const int n_threads = threadpool->n_threads_max;
19649
+
19650
+ lm_ggml_mutex_lock(&threadpool->mutex);
19651
+
19652
+ threadpool->stop = true;
19653
+ threadpool->pause = false;
19654
+
19655
+ lm_ggml_cond_broadcast(&threadpool->cond);
19656
+ lm_ggml_mutex_unlock(&threadpool->mutex);
19657
+
19658
+ for (int j = 1; j < n_threads; j++) {
19659
+ int32_t rc = lm_ggml_thread_join(workers[j].thrd, NULL);
19660
+ LM_GGML_ASSERT(rc == LM_GGML_EXIT_SUCCESS || rc == LM_GGML_EXIT_ABORTED);
19661
+ UNUSED(rc);
19662
+ }
19663
+
19664
+ lm_ggml_mutex_destroy(&threadpool->mutex);
19665
+ lm_ggml_cond_destroy(&threadpool->cond);
19666
+ #endif // LM_GGML_USE_OPENMP
19667
+
19668
+ LM_GGML_ALIGNED_FREE(threadpool->workers);
19669
+ LM_GGML_ALIGNED_FREE(threadpool);
19670
+ }
19671
+
19672
+ #ifndef LM_GGML_USE_OPENMP
19673
+ // pause/resume must be called under mutex
19674
+ static void lm_ggml_threadpool_pause_locked(struct lm_ggml_threadpool * threadpool) {
19675
+ LM_GGML_PRINT_DEBUG("Pausing threadpool\n");
19676
+ threadpool->pause = true;
19677
+ lm_ggml_cond_broadcast(&threadpool->cond);
19678
+ }
19679
+
19680
+ static void lm_ggml_threadpool_resume_locked(struct lm_ggml_threadpool * threadpool) {
19681
+ LM_GGML_PRINT_DEBUG("Resuming threadpool\n");
19682
+ threadpool->pause = false;
19683
+ lm_ggml_cond_broadcast(&threadpool->cond);
19684
+ }
19685
+ #endif
19686
+
19687
+ void lm_ggml_threadpool_pause(struct lm_ggml_threadpool * threadpool) {
19688
+ #ifndef LM_GGML_USE_OPENMP
19689
+ lm_ggml_mutex_lock(&threadpool->mutex);
19690
+ if (!threadpool->pause) {
19691
+ lm_ggml_threadpool_pause_locked(threadpool);
19692
+ }
19693
+ lm_ggml_mutex_unlock(&threadpool->mutex);
19694
+ #else
19695
+ UNUSED(threadpool);
19696
+ #endif
19697
+ }
19698
+
19699
+ void lm_ggml_threadpool_resume(struct lm_ggml_threadpool * threadpool) {
19700
+ #ifndef LM_GGML_USE_OPENMP
19701
+ lm_ggml_mutex_lock(&threadpool->mutex);
19702
+ if (threadpool->pause) {
19703
+ lm_ggml_threadpool_resume_locked(threadpool);
19704
+ }
19705
+ lm_ggml_mutex_unlock(&threadpool->mutex);
19706
+ #else
19707
+ UNUSED(threadpool);
19708
+ #endif
19709
+ }
19710
+
19711
+ struct lm_ggml_cplan lm_ggml_graph_plan(
19712
+ const struct lm_ggml_cgraph * cgraph,
19713
+ int n_threads,
19714
+ struct lm_ggml_threadpool * threadpool) {
19715
+
19716
+ if (threadpool == NULL) {
19717
+ LM_GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
19718
+ }
18607
19719
  if (n_threads <= 0) {
18608
- n_threads = LM_GGML_DEFAULT_N_THREADS;
19720
+ n_threads = threadpool ? threadpool->n_threads_max : LM_GGML_DEFAULT_N_THREADS;
18609
19721
  }
18610
19722
 
18611
19723
  size_t work_size = 0;
@@ -18761,12 +19873,13 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in
18761
19873
  }
18762
19874
 
18763
19875
  if (work_size > 0) {
18764
- work_size += CACHE_LINE_SIZE*(n_threads - 1);
19876
+ work_size += CACHE_LINE_SIZE*(n_threads);
18765
19877
  }
18766
19878
 
18767
- cplan.n_threads = MIN(max_tasks, n_threads);
18768
- cplan.work_size = work_size;
18769
- cplan.work_data = NULL;
19879
+ cplan.threadpool = threadpool;
19880
+ cplan.n_threads = MIN(max_tasks, n_threads);
19881
+ cplan.work_size = work_size;
19882
+ cplan.work_data = NULL;
18770
19883
 
18771
19884
  return cplan;
18772
19885
  }
@@ -18774,17 +19887,17 @@ struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, in
18774
19887
  static thread_ret_t lm_ggml_graph_compute_thread(void * data) {
18775
19888
  struct lm_ggml_compute_state * state = (struct lm_ggml_compute_state *) data;
18776
19889
 
18777
- const struct lm_ggml_cgraph * cgraph = state->shared->cgraph;
18778
- const struct lm_ggml_cplan * cplan = state->shared->cplan;
19890
+ const struct lm_ggml_cgraph * cgraph = state->threadpool->cgraph;
19891
+ const struct lm_ggml_cplan * cplan = state->threadpool->cplan;
18779
19892
 
18780
19893
  set_numa_thread_affinity(state->ith);
18781
19894
 
18782
19895
  struct lm_ggml_compute_params params = {
18783
- /*.ith =*/ state->ith,
18784
- /*.nth =*/ state->shared->n_threads,
18785
- /*.wsize =*/ cplan->work_size,
18786
- /*.wdata =*/ cplan->work_data,
18787
- /*.shared=*/ state->shared,
19896
+ /*.ith =*/ state->ith,
19897
+ /*.nth =*/ state->threadpool->n_threads_cur,
19898
+ /*.wsize =*/ cplan->work_size,
19899
+ /*.wdata =*/ cplan->work_data,
19900
+ /*.threadpool=*/ state->threadpool,
18788
19901
  };
18789
19902
 
18790
19903
  for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@@ -18793,12 +19906,12 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) {
18793
19906
  lm_ggml_compute_forward(&params, node);
18794
19907
 
18795
19908
  if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
18796
- state->shared->ec = LM_GGML_STATUS_ABORTED;
19909
+ state->threadpool->ec = LM_GGML_STATUS_ABORTED;
18797
19910
  }
18798
19911
 
18799
- lm_ggml_barrier(state->shared);
19912
+ lm_ggml_barrier(state->threadpool);
18800
19913
 
18801
- if (state->shared->ec != LM_GGML_STATUS_SUCCESS) {
19914
+ if (state->threadpool->ec != LM_GGML_STATUS_SUCCESS) {
18802
19915
  break;
18803
19916
  }
18804
19917
  }
@@ -18806,24 +19919,243 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) {
18806
19919
  return 0;
18807
19920
  }
18808
19921
 
19922
+ #ifndef LM_GGML_USE_OPENMP
19923
+
19924
+ static inline bool lm_ggml_graph_compute_ready(struct lm_ggml_compute_state * state) {
19925
+ struct lm_ggml_threadpool * threadpool = state->threadpool;
19926
+
19927
+ if (state->pending || threadpool->stop || threadpool->pause) { return true; }
19928
+
19929
+ // check for new graph/work
19930
+ int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
19931
+ if (new_graph != state->last_graph) {
19932
+ state->pending = (state->ith < threadpool->n_threads_cur);
19933
+ state->last_graph = new_graph;
19934
+ }
19935
+
19936
+ return state->pending;
19937
+ }
19938
+
19939
+ static inline bool lm_ggml_graph_compute_poll_for_work(struct lm_ggml_compute_state * state) {
19940
+ struct lm_ggml_threadpool * threadpool = state->threadpool;
19941
+
19942
+ // This seems to make 0 ... 100 a decent range for polling level across modern processors.
19943
+ // Perhaps, we can adjust it dynamically based on load and things.
19944
+ const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
19945
+
19946
+ for (uint64_t i=0; !lm_ggml_graph_compute_ready(state) && i<n_rounds; i++) {
19947
+ // No new work. Keep polling.
19948
+ lm_ggml_thread_cpu_relax();
19949
+ }
19950
+
19951
+ return state->pending;
19952
+ }
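
For scale: with the default poll level of 50 set in lm_ggml_threadpool_params_init further down, the busy-wait budget is n_rounds = 1024 * 128 * 50 = 6,553,600 calls to lm_ggml_thread_cpu_relax() before the worker falls back to the mutex/cond wait in lm_ggml_graph_compute_check_for_work; poll = 0 disables polling entirely (n_rounds = 0) and poll = 100 doubles the budget to 13,107,200.
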
19953
+
19954
+ static inline bool lm_ggml_graph_compute_check_for_work(struct lm_ggml_compute_state * state) {
19955
+ struct lm_ggml_threadpool * threadpool = state->threadpool;
19956
+
19957
+ if (lm_ggml_graph_compute_poll_for_work(state)) {
19958
+ return state->pending;
19959
+ }
19960
+
19961
+ lm_ggml_mutex_lock_shared(&threadpool->mutex);
19962
+ while (!lm_ggml_graph_compute_ready(state)) {
19963
+ // No new work. Wait for the signal.
19964
+ LM_GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
19965
+ lm_ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
19966
+ }
19967
+ lm_ggml_mutex_unlock_shared(&threadpool->mutex);
19968
+
19969
+ return state->pending;
19970
+ }
19971
+
19972
+ static thread_ret_t lm_ggml_graph_compute_secondary_thread(void* data) {
19973
+ struct lm_ggml_compute_state * state = (struct lm_ggml_compute_state *) data;
19974
+ struct lm_ggml_threadpool * threadpool = state->threadpool;
19975
+
19976
+ lm_ggml_thread_apply_priority(threadpool->prio);
19977
+ if (lm_ggml_thread_cpumask_is_valid(state->cpumask)) {
19978
+ lm_ggml_thread_apply_affinity(state->cpumask);
19979
+ }
19980
+
19981
+ while (true) {
19982
+ // Check if we need to sleep
19983
+ while (threadpool->pause) {
19984
+ LM_GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
19985
+ lm_ggml_mutex_lock_shared(&threadpool->mutex);
19986
+ if (threadpool->pause) {
19987
+ lm_ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
19988
+ }
19989
+ LM_GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
19990
+ lm_ggml_mutex_unlock_shared(&threadpool->mutex);
19991
+ }
19992
+
19993
+ // This needs to be checked for after the cond_wait
19994
+ if (threadpool->stop) break;
19995
+
19996
+ // Check if there is new work
19997
+ // The main thread is the only one that can dispatch new work
19998
+
19999
+ lm_ggml_graph_compute_check_for_work(state);
20000
+ if (state->pending) {
20001
+ state->pending = false;
20002
+
20003
+ lm_ggml_graph_compute_thread(state);
20004
+ }
20005
+ }
20006
+
20007
+ return (thread_ret_t) 0;
20008
+ }
20009
+
20010
+ // Start processing new graph
20011
+ static void lm_ggml_graph_compute_kickoff(struct lm_ggml_threadpool * threadpool)
20012
+ {
20013
+ // always take the mutex here because the worker threads are doing hybrid poll/wait
20014
+
20015
+ lm_ggml_mutex_lock(&threadpool->mutex);
20016
+
20017
+ atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
20018
+
20019
+ if (threadpool->pause) {
20020
+ // Update main thread prio and affinity to match the threadpool settings
20021
+ lm_ggml_thread_apply_priority(threadpool->prio);
20022
+ if (lm_ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
20023
+ lm_ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
20024
+ }
20025
+
20026
+ // resume does cond broadcast
20027
+ lm_ggml_threadpool_resume_locked(threadpool);
20028
+ } else {
20029
+ lm_ggml_cond_broadcast(&threadpool->cond);
20030
+ }
20031
+
20032
+ lm_ggml_mutex_unlock(&threadpool->mutex);
20033
+ }
20034
+
20035
+ #endif // LM_GGML_USE_OPENMP
20036
+
20037
+ void lm_ggml_threadpool_params_init(struct lm_ggml_threadpool_params * p, int n_threads) {
20038
+ p->n_threads = n_threads;
20039
+ p->prio = 0; // default priority (usually means normal or inherited)
20040
+ p->poll = 50; // hybrid-polling enabled
20041
+ p->strict_cpu = false; // no strict placement (all threads share same cpumask)
20042
+ p->paused = false; // threads are ready to go
20043
+ memset(p->cpumask, 0, LM_GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
20044
+ }
20045
+
20046
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_default(int n_threads) {
20047
+ struct lm_ggml_threadpool_params p;
20048
+ lm_ggml_threadpool_params_init(&p, n_threads);
20049
+ return p;
20050
+ }
20051
+
20052
+ bool lm_ggml_threadpool_params_match(const struct lm_ggml_threadpool_params * p0, const struct lm_ggml_threadpool_params * p1) {
20053
+ if (p0->n_threads != p1->n_threads ) return false;
20054
+ if (p0->prio != p1->prio ) return false;
20055
+ if (p0->poll != p1->poll ) return false;
20056
+ if (p0->strict_cpu != p1->strict_cpu ) return false;
20057
+ return memcmp(p0->cpumask, p1->cpumask, LM_GGML_MAX_N_THREADS) == 0;
20058
+ }
20059
+
20060
+ static struct lm_ggml_threadpool * lm_ggml_threadpool_new_impl(
20061
+ struct lm_ggml_threadpool_params * tpp,
20062
+ struct lm_ggml_cgraph * cgraph,
20063
+ struct lm_ggml_cplan * cplan) {
20064
+
20065
+ struct lm_ggml_threadpool * threadpool =
20066
+ LM_GGML_ALIGNED_MALLOC(sizeof(struct lm_ggml_threadpool));
20067
+ {
20068
+ threadpool->cgraph = cgraph;
20069
+ threadpool->cplan = cplan;
20070
+ threadpool->n_graph = 0;
20071
+ threadpool->n_barrier = 0;
20072
+ threadpool->n_barrier_passed = 0;
20073
+ threadpool->current_chunk = 0;
20074
+ threadpool->stop = false;
20075
+ threadpool->pause = tpp->paused;
20076
+ threadpool->workers = NULL;
20077
+ threadpool->n_threads_max = tpp->n_threads;
20078
+ threadpool->n_threads_cur = tpp->n_threads;
20079
+ threadpool->poll = tpp->poll;
20080
+ threadpool->prio = tpp->prio;
20081
+ threadpool->ec = LM_GGML_STATUS_SUCCESS;
20082
+ }
20083
+
20084
+ // Allocate and init workers state
20085
+ const size_t workers_size = sizeof(struct lm_ggml_compute_state) * tpp->n_threads;
20086
+ struct lm_ggml_compute_state * workers = LM_GGML_ALIGNED_MALLOC(workers_size);
20087
+
20088
+ memset(workers, 0, workers_size);
20089
+ for (int j = 0; j < tpp->n_threads; j++) {
20090
+ workers[j].threadpool = threadpool;
20091
+ workers[j].ith = j;
20092
+ }
20093
+
20094
+ threadpool->workers = workers;
20095
+
20096
+ #ifndef LM_GGML_USE_OPENMP
20097
+ lm_ggml_mutex_init(&threadpool->mutex);
20098
+ lm_ggml_cond_init(&threadpool->cond);
20099
+
20100
+ // Spin the threads for all workers, and update CPU placements.
20101
+ // Place the main thread last (towards the higher numbered CPU cores).
20102
+
20103
+ int32_t cpumask_iter = 0;
20104
+
20105
+ for (int j = 1; j < tpp->n_threads; j++) {
20106
+ lm_ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
20107
+
20108
+ int32_t rc = lm_ggml_thread_create(&workers[j].thrd, NULL, lm_ggml_graph_compute_secondary_thread, &workers[j]);
20109
+ LM_GGML_ASSERT(rc == 0);
20110
+ }
20111
+
20112
+ lm_ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
20113
+
20114
+ if (!threadpool->pause) {
20115
+ // Update main thread prio and affinity at the start, otherwise we'll do it in resume
20116
+ lm_ggml_thread_apply_priority(threadpool->prio);
20117
+ if (lm_ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
20118
+ lm_ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
20119
+ }
20120
+ }
20121
+ #endif // LM_GGML_USE_OPENMP
20122
+
20123
+ return threadpool;
20124
+ }
20125
+
20126
+ struct lm_ggml_threadpool * lm_ggml_threadpool_new(struct lm_ggml_threadpool_params * tpp) {
20127
+ return lm_ggml_threadpool_new_impl(tpp, NULL, NULL);
20128
+ }
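
Putting the new API together, a minimal usage sketch against the functions added in this diff; the graph and work-buffer variables (graph, work_buffer) are placeholders and error handling is omitted:

    // Build a persistent threadpool once, then reuse it across graph computations.
    struct lm_ggml_threadpool_params tpp = lm_ggml_threadpool_params_default(8);
    tpp.strict_cpu = true;  // optional: pin each worker to its own CPU via the cpumask logic above

    struct lm_ggml_threadpool * tp = lm_ggml_threadpool_new(&tpp);

    struct lm_ggml_cplan cplan = lm_ggml_graph_plan(graph, 8, tp);  // graph: an existing lm_ggml_cgraph *
    cplan.work_data = work_buffer;  // caller-provided buffer of at least cplan.work_size bytes

    enum lm_ggml_status status = lm_ggml_graph_compute(graph, &cplan);

    lm_ggml_threadpool_free(tp);    // joins the workers and releases the pool

If no threadpool is passed (cplan.threadpool == NULL), lm_ggml_graph_compute creates and frees a disposable one internally, as the code below shows.
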
20129
+
18809
20130
  enum lm_ggml_status lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan) {
18810
20131
  LM_GGML_ASSERT(cplan);
18811
20132
  LM_GGML_ASSERT(cplan->n_threads > 0);
18812
20133
  LM_GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
18813
20134
 
18814
- int n_threads = cplan->n_threads;
18815
-
18816
- struct lm_ggml_compute_state_shared state_shared = {
18817
- /*.cgraph =*/ cgraph,
18818
- /*.cgraph_plan =*/ cplan,
18819
- /*.n_threads =*/ n_threads,
18820
- /*.n_barrier =*/ 0,
18821
- /*.n_barrier_passed =*/ 0,
18822
- /*.abort_callback =*/ NULL,
18823
- /*.abort_callback_data =*/ NULL,
18824
- /*.current_chunk =*/ 0,
18825
- /*.ec =*/ LM_GGML_STATUS_SUCCESS,
18826
- };
20135
+ int n_threads = cplan->n_threads;
20136
+ struct lm_ggml_threadpool * threadpool = cplan->threadpool;
20137
+
20138
+ bool disposable_threadpool = false;
20139
+
20140
+ if (threadpool == NULL) {
20141
+ LM_GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
20142
+ disposable_threadpool = true;
20143
+
20144
+ struct lm_ggml_threadpool_params ttp = lm_ggml_threadpool_params_default(n_threads);
20145
+ threadpool = lm_ggml_threadpool_new_impl(&ttp, cgraph, cplan);
20146
+ } else {
20147
+ // Reset some of the parameters that need resetting
20148
+ // No worker threads should be accessing the parameters below at this stage
20149
+ threadpool->cgraph = cgraph;
20150
+ threadpool->cplan = cplan;
20151
+ threadpool->n_threads_cur = n_threads;
20152
+ threadpool->current_chunk = 0;
20153
+ threadpool->ec = LM_GGML_STATUS_SUCCESS;
20154
+ }
20155
+
20156
+ if (n_threads > threadpool->n_threads_max) {
20157
+ LM_GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
20158
+ }
18827
20159
 
18828
20160
  #ifdef LM_GGML_USE_OPENMP
18829
20161
  if (n_threads > 1) {
@@ -18833,63 +20165,36 @@ enum lm_ggml_status lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct
18833
20165
  {
18834
20166
  // update the number of threads from the actual number of threads that we got from OpenMP
18835
20167
  n_threads = omp_get_num_threads();
18836
- state_shared.n_threads = n_threads;
20168
+ threadpool->n_threads_cur = n_threads;
18837
20169
  }
18838
20170
 
18839
- struct lm_ggml_compute_state worker = {
18840
- .thrd = 0,
18841
- .ith = omp_get_thread_num(),
18842
- .shared = &state_shared,
18843
- };
18844
- lm_ggml_graph_compute_thread(&worker);
20171
+ lm_ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
18845
20172
  }
18846
20173
  } else {
18847
- struct lm_ggml_compute_state worker = {
18848
- .thrd = 0,
18849
- .ith = 0,
18850
- .shared = &state_shared,
18851
- };
18852
- lm_ggml_graph_compute_thread(&worker);
20174
+ lm_ggml_graph_compute_thread(&threadpool->workers[0]);
18853
20175
  }
18854
20176
  #else
18855
- struct lm_ggml_compute_state * workers = alloca(sizeof(struct lm_ggml_compute_state)*n_threads);
20177
+ // Kick all threads to start the new graph
20178
+ lm_ggml_graph_compute_kickoff(threadpool);
18856
20179
 
18857
- for (int j = 0; j < n_threads; ++j) {
18858
- workers[j] = (struct lm_ggml_compute_state) {
18859
- .thrd = 0,
18860
- .ith = j,
18861
- .shared = &state_shared,
18862
- };
18863
- }
18864
-
18865
- // create thread pool
18866
- for (int j = 1; j < n_threads; ++j) {
18867
- const int rc = lm_ggml_thread_create(&workers[j].thrd, NULL, lm_ggml_graph_compute_thread, &workers[j]);
18868
- LM_GGML_ASSERT(rc == 0);
18869
- UNUSED(rc);
18870
- }
18871
-
18872
- // this is a work thread too
18873
- lm_ggml_graph_compute_thread(&workers[0]);
18874
-
18875
- // join or kill thread pool
18876
- if (n_threads > 1) {
18877
- for (int j = 1; j < n_threads; j++) {
18878
- const int rc = lm_ggml_thread_join(workers[j].thrd, NULL);
18879
- LM_GGML_ASSERT(rc == 0);
18880
- UNUSED(rc);
18881
- }
18882
- }
20180
+ // This is a work thread too
20181
+ lm_ggml_graph_compute_thread(&threadpool->workers[0]);
18883
20182
  #endif
18884
20183
 
18885
20184
  // don't leave affinity set on the main thread
18886
20185
  clear_numa_thread_affinity();
18887
20186
 
18888
- return state_shared.ec;
20187
+ enum lm_ggml_status ret = threadpool->ec;
20188
+
20189
+ if (disposable_threadpool) {
20190
+ lm_ggml_threadpool_free(threadpool);
20191
+ }
20192
+
20193
+ return ret;
18889
20194
  }
18890
20195
 
18891
20196
  enum lm_ggml_status lm_ggml_graph_compute_with_ctx(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph, int n_threads) {
18892
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, n_threads);
20197
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, n_threads, NULL);
18893
20198
 
18894
20199
  struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
18895
20200
 
@@ -19030,9 +20335,11 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna
19030
20335
 
19031
20336
  const uint32_t type = tensor->type;
19032
20337
  const uint32_t op = tensor->op;
20338
+ const int32_t flags = tensor->flags;
19033
20339
 
19034
20340
  fwrite(&type, sizeof(uint32_t), 1, fout);
19035
20341
  fwrite(&op, sizeof(uint32_t), 1, fout);
20342
+ fwrite(&flags, sizeof(int32_t), 1, fout);
19036
20343
 
19037
20344
  for (int j = 0; j < LM_GGML_MAX_DIMS; ++j) {
19038
20345
  const uint64_t ne = tensor->ne[j];
@@ -19062,9 +20369,11 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna
19062
20369
 
19063
20370
  const uint32_t type = tensor->type;
19064
20371
  const uint32_t op = tensor->op;
20372
+ const int32_t flags = tensor->flags;
19065
20373
 
19066
20374
  fwrite(&type, sizeof(uint32_t), 1, fout);
19067
20375
  fwrite(&op, sizeof(uint32_t), 1, fout);
20376
+ fwrite(&flags, sizeof(int32_t), 1, fout);
19068
20377
 
19069
20378
  for (int j = 0; j < LM_GGML_MAX_DIMS; ++j) {
19070
20379
  const uint64_t ne = tensor->ne[j];
@@ -19123,6 +20432,14 @@ void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fna
19123
20432
  }
19124
20433
  }
19125
20434
  }
20435
+
20436
+ // dump the data
20437
+ // TODO: pad this to 32 byte boundary
20438
+ if ((flags & LM_GGML_TENSOR_FLAG_PARAM)) {
20439
+ const size_t size = lm_ggml_nbytes(tensor);
20440
+
20441
+ fwrite(tensor->data, sizeof(char), size, fout);
20442
+ }
19126
20443
  }
19127
20444
  }
19128
20445
 
@@ -19236,10 +20553,12 @@ struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_
19236
20553
  {
19237
20554
  uint32_t type;
19238
20555
  uint32_t op;
20556
+ int32_t flags;
19239
20557
 
19240
20558
  for (uint32_t i = 0; i < n_leafs; ++i) {
19241
20559
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
19242
20560
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
20561
+ flags = *(const int32_t *) ptr; ptr += sizeof(flags);
19243
20562
 
19244
20563
  int64_t ne[LM_GGML_MAX_DIMS];
19245
20564
  size_t nb[LM_GGML_MAX_DIMS];
@@ -19257,20 +20576,19 @@ struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_
19257
20576
 
19258
20577
  struct lm_ggml_tensor * tensor = lm_ggml_new_tensor(*ctx_eval, (enum lm_ggml_type) type, LM_GGML_MAX_DIMS, ne);
19259
20578
 
19260
- tensor->op = (enum lm_ggml_op) op;
20579
+ tensor->op = (enum lm_ggml_op) op;
20580
+ tensor->flags = flags;
19261
20581
 
19262
20582
  memcpy(tensor->name, ptr, LM_GGML_MAX_NAME); ptr += LM_GGML_MAX_NAME;
19263
20583
  memcpy(tensor->op_params, ptr, LM_GGML_MAX_OP_PARAMS); ptr += LM_GGML_MAX_OP_PARAMS;
19264
20584
 
19265
- tensor->data = (void *) ptr;
19266
-
19267
20585
  for (int j = 0; j < LM_GGML_MAX_DIMS; ++j) {
19268
20586
  tensor->nb[j] = nb[j];
19269
20587
  }
19270
20588
 
19271
- result->leafs[i] = tensor;
20589
+ tensor->data = (void *) ptr; ptr += lm_ggml_nbytes(tensor);
19272
20590
 
19273
- ptr += lm_ggml_nbytes(tensor);
20591
+ result->leafs[i] = tensor;
19274
20592
 
19275
20593
  fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, lm_ggml_nbytes(tensor));
19276
20594
  }
@@ -19282,10 +20600,12 @@ struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_
19282
20600
  {
19283
20601
  uint32_t type;
19284
20602
  uint32_t op;
20603
+ int32_t flags;
19285
20604
 
19286
20605
  for (uint32_t i = 0; i < n_nodes; ++i) {
19287
20606
  type = *(const uint32_t *) ptr; ptr += sizeof(type);
19288
20607
  op = *(const uint32_t *) ptr; ptr += sizeof(op);
20608
+ flags = *(const int32_t *) ptr; ptr += sizeof(flags);
19289
20609
 
19290
20610
  enum lm_ggml_op eop = (enum lm_ggml_op) op;
19291
20611
 
@@ -19375,6 +20695,11 @@ struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_
19375
20695
 
19376
20696
  result->nodes[i] = tensor;
19377
20697
 
20698
+ // TODO tensor data is duplicated due to lm_ggml_new_tensor call above
20699
+ if (flags & LM_GGML_TENSOR_FLAG_PARAM) {
20700
+ tensor->data = (void *) ptr; ptr += lm_ggml_nbytes(tensor);
20701
+ }
20702
+
19378
20703
  fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, lm_ggml_nbytes(tensor));
19379
20704
  }
19380
20705
  }
@@ -19643,6 +20968,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam(
19643
20968
  lm_ggml_opt_callback callback,
19644
20969
  void * callback_data) {
19645
20970
  LM_GGML_ASSERT(lm_ggml_is_scalar(f));
20971
+ LM_GGML_ASSERT(f->type == LM_GGML_TYPE_F32);
19646
20972
 
19647
20973
  // these will store the parameters we want to optimize
19648
20974
  struct lm_ggml_tensor * ps[LM_GGML_MAX_PARAMS];
@@ -19684,7 +21010,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam(
19684
21010
 
19685
21011
  float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
19686
21012
 
19687
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(gb, params.n_threads);
21013
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(gb, params.n_threads, NULL);
19688
21014
  struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
19689
21015
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
19690
21016
 
@@ -20031,7 +21357,7 @@ static enum lm_ggml_opt_result lm_ggml_opt_lbfgs(
20031
21357
  opt->iter = iter;
20032
21358
  }
20033
21359
 
20034
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(gb, params.n_threads);
21360
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(gb, params.n_threads, NULL);
20035
21361
  struct lm_ggml_object * obj = lm_ggml_new_object(ctx, LM_GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
20036
21362
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
20037
21363
 
@@ -20409,6 +21735,8 @@ enum lm_ggml_opt_result lm_ggml_opt(
20409
21735
  struct lm_ggml_context * ctx,
20410
21736
  struct lm_ggml_opt_params params,
20411
21737
  struct lm_ggml_tensor * f) {
21738
+ LM_GGML_ASSERT(f->grad && "lm_ggml_set_param called for at least one parent tensor.");
21739
+
20412
21740
  bool free_ctx = false;
20413
21741
  if (ctx == NULL) {
20414
21742
  struct lm_ggml_init_params params_ctx = {
@@ -20463,6 +21791,8 @@ enum lm_ggml_opt_result lm_ggml_opt_resume_g(
20463
21791
  lm_ggml_opt_callback callback,
20464
21792
  void * callback_data) {
20465
21793
 
21794
+ LM_GGML_ASSERT(f->grad && "lm_ggml_set_param must be called for at least one ancestor");
21795
+
20466
21796
  // build forward + backward compute graphs
20467
21797
  enum lm_ggml_opt_result result = LM_GGML_OPT_RESULT_OK;
20468
21798
 
@@ -20574,6 +21904,8 @@ size_t lm_ggml_quantize_chunk(
20574
21904
  case LM_GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20575
21905
  case LM_GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20576
21906
  case LM_GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
21907
+ case LM_GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
21908
+ case LM_GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20577
21909
  case LM_GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20578
21910
  case LM_GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20579
21911
  case LM_GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@@ -21550,6 +22882,7 @@ void lm_gguf_set_kv(struct lm_gguf_context * ctx, struct lm_gguf_context * src)
21550
22882
  void lm_gguf_add_tensor(
21551
22883
  struct lm_gguf_context * ctx,
21552
22884
  const struct lm_ggml_tensor * tensor) {
22885
+ LM_GGML_ASSERT(tensor);
21553
22886
  if (lm_gguf_find_tensor(ctx, tensor->name) != -1) {
21554
22887
  LM_GGML_ABORT("duplicated tensor name");
21555
22888
  }