@fugood/llama.node 1.0.0-beta.4 → 1.0.0-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/CMakeLists.txt +7 -4
  2. package/lib/binding.ts +1 -1
  3. package/package.json +14 -14
  4. package/scripts/llama.cpp.patch +27 -26
  5. package/src/LlamaCompletionWorker.cpp +21 -4
  6. package/src/LlamaCompletionWorker.h +2 -0
  7. package/src/LlamaContext.cpp +3 -12
  8. package/src/common.hpp +6 -5
  9. package/src/llama.cpp/CMakeLists.txt +15 -4
  10. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  11. package/src/llama.cpp/common/arg.cpp +172 -110
  12. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  13. package/src/llama.cpp/common/chat-parser.h +120 -0
  14. package/src/llama.cpp/common/chat.cpp +726 -596
  15. package/src/llama.cpp/common/chat.h +74 -8
  16. package/src/llama.cpp/common/common.cpp +56 -38
  17. package/src/llama.cpp/common/common.h +9 -3
  18. package/src/llama.cpp/common/json-partial.cpp +256 -0
  19. package/src/llama.cpp/common/json-partial.h +38 -0
  20. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  21. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  22. package/src/llama.cpp/common/sampling.cpp +7 -8
  23. package/src/llama.cpp/common/speculative.cpp +6 -4
  24. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  25. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  26. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  27. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  28. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  29. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  30. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  43. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  47. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  48. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  49. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  50. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  52. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  54. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  57. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  58. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  60. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  61. package/src/llama.cpp/include/llama.h +145 -40
  62. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  63. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  64. package/src/llama.cpp/src/llama-arch.h +10 -1
  65. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  66. package/src/llama.cpp/src/llama-batch.h +112 -54
  67. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  68. package/src/llama.cpp/src/llama-chat.h +1 -0
  69. package/src/llama.cpp/src/llama-context.cpp +525 -339
  70. package/src/llama.cpp/src/llama-context.h +38 -17
  71. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  72. package/src/llama.cpp/src/llama-cparams.h +2 -0
  73. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  74. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  75. package/src/llama.cpp/src/llama-graph.h +112 -56
  76. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  77. package/src/llama.cpp/src/llama-hparams.h +13 -2
  78. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  79. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  80. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  81. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  82. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  83. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  84. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  85. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  86. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  87. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  88. package/src/llama.cpp/src/llama-memory.h +86 -5
  89. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  90. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  91. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  92. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  93. package/src/llama.cpp/src/llama-model.h +4 -0
  94. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  95. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  96. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  97. package/src/llama.cpp/src/llama-vocab.h +1 -0
  98. package/src/llama.cpp/src/llama.cpp +11 -7
  99. package/src/llama.cpp/src/unicode.cpp +5 -0
  100. package/src/tts_utils.h +1 -1
  101. package/src/llama.cpp/common/json.hpp +0 -24766
  102. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  103. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  104. package/src/llama.cpp/common/stb_image.h +0 -7988
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  106. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  107. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  108. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  109. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  110. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  111. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp

@@ -53,7 +53,6 @@
  #include "ggml-cpu-impl.h"
  #include "ggml-quants.h"

- #include <atomic>
  #include <array>
  #include <type_traits>

@@ -63,7 +62,7 @@
  #define NOINLINE __attribute__((__noinline__))
  #endif

- #if defined(__ARM_NEON) || defined(__AVX512F__)
+ #if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
  #define VECTOR_REGISTERS 32
  #else
  #define VECTOR_REGISTERS 16
@@ -110,6 +109,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
  inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

+ #if defined(__VXE__) || defined(__VXE2__)
+ inline float32x4_t add(float32x4_t x, float32x4_t y) { return vec_add(x, y); }
+ inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); }
+ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
+ #endif
+
  #if defined(__MMA__)
  typedef vector unsigned char vec_t;
  typedef __vector_quad acc_t;
@@ -163,6 +168,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
  #endif
  #endif

+ #if defined(__VXE__) || defined(__VXE2__)
+ template <>
+ inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
+ return vec_madd(a, b, c);
+ }
+ #endif
+
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // VECTORIZED HORIZONTAL SUM

@@ -179,6 +191,13 @@ inline float hsum(float16x8_t x) {
  }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

+ #if defined(__VXE__) || defined(__VXE2__)
+ inline float hsum(float32x4_t x) {
+ float32x4_t tmp = x + vec_reve(x);
+ return tmp[0] + tmp[1];
+ }
+ #endif
+
  #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  inline float hsum(__m128 x) {
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
@@ -228,6 +247,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
  #endif // _MSC_VER
  #endif // __ARM_NEON

+ #if defined(__VXE__) || defined(__VXE2__)
+ template <> inline float32x4_t load(const ggml_fp16_t * p) {
+ float tmp[4];
+
+ for (int i = 0; i < 4; i++) {
+ tmp[i] = GGML_FP16_TO_FP32(p[i]);
+ }
+
+ return vec_xl(0, (const float *)(tmp));
+ }
+ template <> inline float32x4_t load(const float * p) {
+ return vec_xl(0, p);
+ }
+ #endif
+
  #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  template <> inline __m128 load(const float *p) {
  return _mm_loadu_ps(p);
@@ -394,8 +428,6 @@ class tinyBLAS {

  template <int RM, int RN, int BM>
  NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
- static std::atomic<int64_t> current_chunk;
-
  GGML_ASSERT(m % (RM * BM) == 0);
  const int64_t ytiles = m / (RM * BM);
  const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +442,7 @@ class tinyBLAS {
  if (params->ith == 0) {
  GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
  // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
- std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+ ggml_threadpool_chunk_set(params->threadpool, params->nth);
  }

  ggml_barrier(params->threadpool);
@@ -439,8 +471,7 @@ class tinyBLAS {
  GGML_ASSERT(jj == jj2);
  }

- // next step.
- job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+ job = ggml_threadpool_chunk_add(params->threadpool, 1);
  }

  ggml_barrier(params->threadpool);
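The two hunks above drop the function-local static std::atomic<int64_t> current_chunk and let the ggml threadpool own the chunk counter (ggml_threadpool_chunk_set / ggml_threadpool_chunk_add), so every tinyBLAS template instantiation shares one cursor instead of keeping its own. A minimal standalone sketch of the claiming pattern itself, with plain std::atomic and std::thread standing in for the threadpool (illustrative only, not ggml code):

    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
        const int64_t nth     = 4;   // worker threads (params->nth in the real code)
        const int64_t nchunks = 16;  // total tiles of work

        // seeded with nth: thread ith implicitly owns chunk ith, so the first
        // dynamically claimed chunk is nth (this is what the in-code comment means)
        std::atomic<int64_t> current_chunk{nth};

        auto worker = [&](int64_t ith) {
            // process the statically assigned chunk first, then pull more from the counter
            for (int64_t job = ith; job < nchunks;
                 job = current_chunk.fetch_add(1, std::memory_order_relaxed)) {
                std::printf("thread %lld processes chunk %lld\n", (long long) ith, (long long) job);
            }
        };

        std::vector<std::thread> workers;
        for (int64_t ith = 0; ith < nth; ++ith) {
            workers.emplace_back(worker, ith);
        }
        for (auto & t : workers) {
            t.join();
        }
        return 0;
    }

The change keeps the same work-stealing behaviour; moving the counter into the threadpool simply gives all instantiations a single source of truth and removes the <atomic> dependency from sgemm.cpp.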
@@ -3323,6 +3354,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
  (const float *)B, ldb,
  (float *)C, ldc};
  return tb.matmul(m, n);
+ #elif defined(__VXE__) || defined(__VXE2__)
+ if (n < 4)
+ return false;
+ tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
+ k, (const float *)A, lda,
+ (const float *)B, ldb,
+ (float *)C, ldc};
+ return tb.matmul(m, n);
  #elif defined(__MMA__)
  if (k % 8)
  return false;
@@ -3414,6 +3453,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
  (float *)C, ldc};
  return tb.matmul(m, n);
  }
+ #elif defined(__VXE__) || defined(__VXE2__)
+ if (n < 4)
+ return false;
+ if (Btype == GGML_TYPE_F16) {
+ tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+ k, (const ggml_fp16_t *)A, lda,
+ (const ggml_fp16_t *)B, ldb,
+ (float *)C, ldc};
+ return tb.matmul(m, n);
+ }
  #endif
  return false;
  }
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h

@@ -1,6 +1,11 @@
  #pragma once
  #include <stdint.h>
  #include <stdbool.h>
+
+ #if defined(__VXE__) || defined(__VXE2__)
+ #include <vecintrin.h>
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -6793,6 +6793,73 @@ void ggml_compute_forward_pad_reflect_1d(
  }
  }

+ // ggml_compute_forward_roll
+
+ static int64_t ggml_wrap_index(int64_t i, int64_t ne) {
+ if (i < 0) {
+ return i + ne;
+ } else if (i >= ne) {
+ return i - ne;
+ }
+ return i;
+ }
+
+ static void ggml_compute_forward_roll_f32(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src_data = (const float *) src0->data;
+ float * dst_data = (float *) dst->data;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ const int s0 = ggml_get_op_params_i32(dst, 0);
+ const int s1 = ggml_get_op_params_i32(dst, 1);
+ const int s2 = ggml_get_op_params_i32(dst, 2);
+ const int s3 = ggml_get_op_params_i32(dst, 3);
+
+ const int64_t total = ne1 * ne2 * ne3;
+ const int64_t per_thread = (total + params->nth) / params->nth;
+ const int64_t start = params->ith * per_thread;
+ const int64_t end = std::min(start + per_thread, total);
+
+ for (int64_t i = start; i < end; ++i) {
+ const int64_t i1 = i % ne1;
+ const int64_t i2 = (i / ne1) % ne2;
+ const int64_t i3 = i / (ne2 * ne1);
+ float * dst_row = dst_data + (i3*nb3 + i2*nb2 + i1*nb1) / sizeof(float);
+
+ const int64_t i01 = ggml_wrap_index(i1 - s1, ne01);
+ const int64_t i02 = ggml_wrap_index(i2 - s2, ne02);
+ const int64_t i03 = ggml_wrap_index(i3 - s3, ne03);
+ const float * src_row = src_data + (i03*nb03 + i02*nb02 + i01*nb01) / sizeof(float);
+
+ const int64_t s = ggml_wrap_index(-s0, ne00);
+ const int64_t n = ne00 - s;
+ ggml_vec_cpy_f32(n, dst_row, src_row + s);
+ ggml_vec_cpy_f32(s, dst_row + n, src_row);
+ }
+ }
+
+ void ggml_compute_forward_roll(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+
+ const ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_roll_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+ }
+
  // ggml_compute_forward_arange

  static void ggml_compute_forward_arange_f32(
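The new roll kernel above copies each output row in two contiguous pieces around the wrap point instead of indexing element by element. A standalone sketch of the same 1-D logic (illustrative only; wrap_index and roll_row are hypothetical names, not the ggml API):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // same wrapping rule as ggml_wrap_index in the hunk above
    static int64_t wrap_index(int64_t i, int64_t ne) {
        if (i < 0)   return i + ne;
        if (i >= ne) return i - ne;
        return i;
    }

    // roll a row right by `shift` using the two-copy scheme of ggml_compute_forward_roll_f32
    static void roll_row(const float * src, float * dst, int64_t ne0, int shift) {
        const int64_t s = wrap_index(-shift, ne0);  // first source element that lands at dst[0]
        const int64_t n = ne0 - s;
        std::copy(src + s, src + ne0, dst);         // tail of src -> head of dst
        std::copy(src, src + s, dst + n);           // head of src -> tail of dst
    }

    int main() {
        std::vector<float> src = {0, 1, 2, 3, 4};
        std::vector<float> dst(src.size());
        roll_row(src.data(), dst.data(), (int64_t) src.size(), 2);
        for (float v : dst) std::printf("%g ", v);  // prints: 3 4 0 1 2
        std::printf("\n");
        return 0;
    }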
@@ -7633,39 +7700,83 @@ static void ggml_compute_forward_ssm_scan_f32(
  const int ir1 = MIN(ir0 + dr, nr);
  const int ir = ir1 - ir0;

- for (int i3 = 0; i3 < n_s; ++i3) {
- for (int i2 = 0; i2 < n_t; ++i2) {
- const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
- const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
- const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
- const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
- const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
- const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
- float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
- float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
-
- // use the output as the source for the next token-wise iterations
- if (i2 > 0) { s0 = s; }
-
- // d_inner
- for (int i1 = 0; i1 < ir; ++i1) {
- // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
- float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
- float x_dt = x[i1] * dt_soft_plus;
- float sumf = 0.0f;
- // d_state
- for (int i0 = 0; i0 < nc; ++i0) {
- int i = i0 + i1*nc;
- // state = prev_state * dA + dB * x
- float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
- // y = rowwise_dotprod(state, C)
- sumf += state * C[i0];
- s[i] = state;
+ #ifdef __ARM_FEATURE_SVE
+ for (int i3 = 0; i3 < n_s; ++i3) {
+ for (int i2 = 0; i2 < n_t; ++i2) {
+ const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+ const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+ const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+ const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+ const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+ const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+ float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+ float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
+
+ // use the output as the source for the next token-wise iterations
+ if (i2 > 0) { s0 = s; }
+
+ // d_inner
+ for (int i1 = 0; i1 < ir; ++i1) {
+ float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+ float x_dt = x[i1] * dt_soft_plus;
+ svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
+ svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
+ svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
+
+ for (int64_t k = 0; k < nc; k += svcntw()) {
+ svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]);
+ svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]);
+ svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]);
+ svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]);
+
+ svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
+ t1 = exp_ps_sve(svptrue_b32(), t1);
+ svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
+
+ vs0 = GGML_F32_VEC_FMA(vs0, t1, t2);
+ r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
+
+ GGML_F32_VEC_STORE(&s[i1*nc + k], vs0);
+ }
+ y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
  }
- y[i1] = sumf;
  }
  }
- }
+ #else
+ for (int i3 = 0; i3 < n_s; ++i3) {
+ for (int i2 = 0; i2 < n_t; ++i2) {
+ const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+ const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+ const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+ const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+ const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+ const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+ float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+ float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
+
+ // use the output as the source for the next token-wise iterations
+ if (i2 > 0) { s0 = s; }
+
+ // d_inner
+ for (int i1 = 0; i1 < ir; ++i1) {
+ // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
+ float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+ float x_dt = x[i1] * dt_soft_plus;
+ float sumf = 0.0f;
+ // d_state
+ for (int i0 = 0; i0 < nc; ++i0) {
+ int i = i0 + i1*nc;
+ // state = prev_state * dA + dB * x
+ float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+ // y = rowwise_dotprod(state, C)
+ sumf += state * C[i0];
+ s[i] = state;
+ }
+ y[i1] = sumf;
+ }
+ }
+ }
+ #endif
  }

  void ggml_compute_forward_ssm_scan(
@@ -8070,6 +8181,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
  #define GGML_F32X_MUL GGML_F32x16_MUL
  #define GGML_F32X_FMA GGML_F32x16_FMA
  #define WKV_VECTOR_SIZE 16
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+ #define GGML_F32X GGML_F32xt
+ #define GGML_F32X_SET1 GGML_F32xt_SET1
+ #define GGML_F32X_LOAD GGML_F32xt_LOAD
+ #define GGML_F32X_STORE GGML_F32xt_STORE
+ #define GGML_F32X_MUL GGML_F32xt_MUL
+ #define GGML_F32X_FMA GGML_F32xt_FMA
+ #define WKV_VECTOR_SIZE 8
  #elif defined(__ARM_NEON) && defined(__aarch64__)
  #define GGML_F32X GGML_F32x4
  #define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8081,7 +8200,13 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
  #endif

  #ifdef WKV_VECTOR_SIZE
- const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
+ int wkv_vector_size;
+ #if defined(__ARM_FEATURE_SVE)
+ wkv_vector_size = svcntw();
+ #else
+ wkv_vector_size = WKV_VECTOR_SIZE;
+ #endif
+ const int64_t vec_count = head_size / wkv_vector_size;

  for (int64_t t = 0; t < T; t++) {
  size_t t_offset = t * t_stride;
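On SVE the number of 32-bit lanes is not a compile-time constant, which is why the hunk above replaces the fixed WKV_VECTOR_SIZE stride with a width read at run time via svcntw(). A minimal sketch of the resulting bulk-plus-tail loop split (plain C++, with a stand-in for svcntw(); illustrative only):

    #include <cstdio>

    static int runtime_vector_size() {
        // stand-in for svcntw(); e.g. 4 on a 128-bit SVE implementation
        return 4;
    }

    int main() {
        const int head_size = 10;
        const int vec_size  = runtime_vector_size();
        const int vec_count = head_size / vec_size;          // number of full vectors
        for (int j = 0; j < vec_count; j++) {
            std::printf("vector body covers [%d, %d)\n", j * vec_size, (j + 1) * vec_size);
        }
        for (int j = vec_count * vec_size; j < head_size; j++) {
            std::printf("scalar tail handles %d\n", j);      // leftover lanes, if any
        }
        return 0;
    }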
@@ -8111,7 +8236,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
  GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);

  for (int64_t j = 0; j < vec_count; j++) {
- size_t base_j = j * WKV_VECTOR_SIZE;
+ size_t base_j = j * wkv_vector_size;
  size_t t_h_j_offset = t_h_offset + base_j;
  size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8136,7 +8261,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
  }

  // Handle remaining elements, this will not be used.
- for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
+ for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
  size_t t_h_j_offset = t_h_offset + j;
  size_t h_2d_i_j_offset = h_2d_i_offset + j;
  float v_val = v[t_h_j_offset];
@@ -8272,6 +8397,14 @@ static void ggml_compute_forward_gla_f32(
  #define GGML_F32X_MUL GGML_F32x16_MUL
  #define GGML_F32X_FMA GGML_F32x16_FMA
  #define GLA_VECTOR_SIZE 16
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+ #define GGML_F32X GGML_F32xt
+ #define GGML_F32X_SET1 GGML_F32xt_SET1
+ #define GGML_F32X_LOAD GGML_F32xt_LOAD
+ #define GGML_F32X_STORE GGML_F32xt_STORE
+ #define GGML_F32X_MUL GGML_F32xt_MUL
+ #define GGML_F32X_FMA GGML_F32xt_FMA
+ #define GLA_VECTOR_SIZE 8
  #elif defined(__ARM_NEON) && defined(__aarch64__)
  #define GGML_F32X GGML_F32x4
  #define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8283,7 +8416,13 @@ static void ggml_compute_forward_gla_f32(
  #endif

  #ifdef GLA_VECTOR_SIZE
- const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
+ int gla_vector_size;
+ #if defined(__ARM_FEATURE_SVE)
+ gla_vector_size = svcntw();
+ #else
+ gla_vector_size = GLA_VECTOR_SIZE;
+ #endif
+ const int64_t vec_count = head_size / gla_vector_size;

  for (int64_t t = 0; t < T; t++) {
  size_t t_offset = t * t_stride;
@@ -8310,7 +8449,7 @@ static void ggml_compute_forward_gla_f32(
  GGML_F32X g_vec = GGML_F32X_SET1(g_val);

  for (int64_t j = 0; j < vec_count; j++) {
- size_t base_j = j * GLA_VECTOR_SIZE;
+ size_t base_j = j * gla_vector_size;
  size_t t_h_j_offset = t_h_offset + base_j;
  size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8334,7 +8473,7 @@ static void ggml_compute_forward_gla_f32(
  }

  // Handle remaining elements, this will not be used.
- for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
+ for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
  size_t t_h_j_offset = t_h_offset + j;
  size_t h_2d_i_j_offset = h_2d_i_offset + j;
  float v_val = v[t_h_j_offset];
@@ -8443,83 +8582,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
  int64_t h_stride_2d = head_size * head_size;

  #if defined(GGML_SIMD)
- for (int64_t t = 0; t < T; t++) {
- int64_t t_offset = t * t_stride;
- int64_t state_offset = head_size * C * (t / (T / n_seqs));
- float * state_cur = state + state_offset;
- float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
-
- for (int64_t h = h_start; h < h_end; h++) {
- int64_t h_offset = h * h_stride;
- int64_t t_h_offset = t_offset + h_offset;
- int64_t h_2d_offset = h * h_stride_2d;
-
- for (int64_t ii = 0; ii < head_size; ii++) {
- int64_t t_h_i_offset = t_h_offset + ii;
- int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
-
- GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
+ #if defined(__ARM_FEATURE_SVE)
+ // scalar Route to scalar implementation //TODO: Write SVE code
+ for (int64_t t = 0; t < T; t++) {
+ int64_t t_offset = t * t_stride;
+ int64_t state_offset = head_size * C * (t / (T / n_seqs));
+ float * state_cur = state + state_offset;
+ float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
+
+ for (int64_t h = h_start; h < h_end; h++) {
+ int64_t h_offset = h * h_stride;
+ int64_t t_h_offset = t_offset + h_offset;
+ int64_t h_2d_offset = h * h_stride_2d;
+
+ for (int64_t i = 0; i < head_size; i++) {
+ int64_t t_h_i_offset = t_h_offset + i;
+ int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
+
+ float v_val = v[t_h_i_offset];
+
+ float sa = 0, result = 0;
+ for (int64_t j = 0; j < head_size; j++) {
+ sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
+ }

- float sa = 0;
- {
- GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
- GGML_F32_VEC ax[GGML_F32_ARR];
- GGML_F32_VEC ay[GGML_F32_ARR];
- for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
- for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
- ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
- ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
- sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
- }
+ for (int64_t j = 0; j < head_size; j++) {
+ int64_t t_h_j_offset = t_h_offset + j;
+ int64_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+ float r_val = r[t_h_j_offset];
+ float w_val = w[t_h_j_offset];
+ float k_val = k[t_h_j_offset];
+ float b_val = b[t_h_j_offset];
+ float kv_val = v_val * k_val;
+ float prev_state_val = state_prev[h_2d_i_j_offset];
+ state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
+ result += state_cur[h_2d_i_j_offset] * r_val;
  }
- GGML_F32_VEC_REDUCE(sa, sum);
+ dst_data[t_h_i_offset] = result;
  }
+ }
+ }
+ #else
+ for (int64_t t = 0; t < T; t++) {
+ int64_t t_offset = t * t_stride;
+ int64_t state_offset = head_size * C * (t / (T / n_seqs));
+ float * state_cur = state + state_offset;
+ float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
+
+ for (int64_t h = h_start; h < h_end; h++) {
+ int64_t h_offset = h * h_stride;
+ int64_t t_h_offset = t_offset + h_offset;
+ int64_t h_2d_offset = h * h_stride_2d;
+
+ for (int64_t ii = 0; ii < head_size; ii++) {
+ int64_t t_h_i_offset = t_h_offset + ii;
+ int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
+
+ GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
+
+ float sa = 0;
+ {
+ GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+ GGML_F32_VEC ax[GGML_F32_ARR];
+ GGML_F32_VEC ay[GGML_F32_ARR];
+ for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
+ for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
+ ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
+ ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
+ sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
+ }
+ }
+ GGML_F32_VEC_REDUCE(sa, sum);
+ }

- GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
+ GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);

- int64_t j = 0;
- GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
- for (; j < head_size; j += GGML_F32_STEP) {
- for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
- int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
- int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
+ int64_t j = 0;
+ GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+ for (; j < head_size; j += GGML_F32_STEP) {
+ for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
+ int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
+ int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;

- GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
- GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
- GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
- GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
+ GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
+ GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
+ GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
+ GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);

- k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
+ k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);

- GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
- // kv + s * decay + sa * b
- state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
- state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
- GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
+ GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
+ // kv + s * decay + sa * b
+ state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
+ state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
+ GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);

- result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
+ result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
+ }
+ }
+ GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
+
+ // There shouldn't be left-overs though.
+ for (; j < head_size; j++) {
+ int64_t t_h_j_offset = t_h_offset + j;
+ int64_t h_2d_i_j_offset = h_2d_i_offset + j;
+
+ float r_val = r[t_h_j_offset];
+ float w_val = w[t_h_j_offset];
+ float k_val = k[t_h_j_offset];
+ float b_val = b[t_h_j_offset];
+ float kv_val = v[t_h_i_offset] * k_val;
+
+ float prev_state_val = state_prev[h_2d_i_j_offset];
+ state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
+ dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
  }
- }
- GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
-
- // There shouldn't be left-overs though.
- for (; j < head_size; j++) {
- int64_t t_h_j_offset = t_h_offset + j;
- int64_t h_2d_i_j_offset = h_2d_i_offset + j;
-
- float r_val = r[t_h_j_offset];
- float w_val = w[t_h_j_offset];
- float k_val = k[t_h_j_offset];
- float b_val = b[t_h_j_offset];
- float kv_val = v[t_h_i_offset] * k_val;
-
- float prev_state_val = state_prev[h_2d_i_j_offset];
- state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
- dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
  }
  }
  }
- }
+ #endif
  #else
  for (int64_t t = 0; t < T; t++) {
  int64_t t_offset = t * t_stride;
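Both branches above compute the same per-element recurrence; the SVE build simply routes to scalar code for now (see the TODO). Reduced to a single output element of one head, the update can be sketched in plain scalar C++ as follows (illustrative names only, not ggml code):

    #include <cstdio>
    #include <vector>

    // One wkv7 step for output element i of one head:
    //   sa   = dot(a, state_prev[i, :])
    //   s[j] = state_prev[i, j] * w[j] + v_i * k[j] + sa * b[j]
    //   out  = dot(r, s[i, :])
    int main() {
        const int head_size = 4;
        std::vector<float> r(head_size, 0.5f), w(head_size, 0.9f), k(head_size, 0.1f),
                           a(head_size, 0.2f), b(head_size, 0.3f);
        std::vector<float> state_prev(head_size, 1.0f), state_cur(head_size);
        const float v_i = 2.0f;

        float sa = 0.0f;
        for (int j = 0; j < head_size; j++) {
            sa += a[j] * state_prev[j];            // correction term from the previous state
        }

        float out = 0.0f;
        for (int j = 0; j < head_size; j++) {
            state_cur[j] = state_prev[j] * w[j]    // decay of the previous state
                         + v_i * k[j]              // kv contribution
                         + sa * b[j];              // sa * b correction
            out += state_cur[j] * r[j];            // readout with r
        }
        std::printf("out = %f\n", out);
        return 0;
    }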
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h

@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
  void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);