@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50):
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
@@ -20,6 +20,9 @@
20
20
 
21
21
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
22
22
 
23
+ // Work buffer size for im2col operations in CONV2D
24
+ #define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
25
+
23
26
  #ifdef __cplusplus
24
27
  extern "C" {
25
28
  #endif
@@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
65
68
  void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
66
69
  void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
67
70
  void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
71
+ void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
68
72
  void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
69
73
  void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
70
74
  void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
107
111
  void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
108
112
  void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
109
113
  void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
114
+ void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
110
115
 
111
116
  #ifdef __cplusplus
112
117
  }
@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
189
189
  #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
190
190
  #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
191
191
  #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
192
- #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
192
+ #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
193
193
  #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
194
194
  #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
195
195
  #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
37
37
  for (int i = 0; i < np; i += ggml_f32_step) {
38
38
  ax1 = GGML_F32_VEC_LOAD(x + i);
39
39
  ay1 = GGML_F32_VEC_LOAD(y + i);
40
- sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
40
+ sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
41
41
 
42
42
  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
43
43
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
44
- sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
44
+ sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
45
45
 
46
46
  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
47
47
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
48
- sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
48
+ sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
49
49
 
50
50
  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
51
51
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
52
- sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
52
+ sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
53
53
 
54
54
  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
55
55
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
56
- sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
56
+ sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
57
57
 
58
58
  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
59
59
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
60
- sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
60
+ sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
61
61
 
62
62
  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
63
63
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
64
- sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
64
+ sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
65
65
 
66
66
  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
67
67
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
68
- sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
68
+ sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
69
69
  }
70
70
  // leftovers
71
71
  // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
73
73
  for (int i = np; i < np2; i += ggml_f32_epr) {
74
74
  ax1 = GGML_F32_VEC_LOAD(x + i);
75
75
  ay1 = GGML_F32_VEC_LOAD(y + i);
76
- sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
76
+ sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
77
77
  }
78
78
  // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
79
79
  if (np2 < n) {
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
221
221
  for (int i = np; i < n; ++i) {
222
222
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
223
223
  }
224
+
225
+ // if you hit this, you are likely running outside the FP range
226
+ assert(!isnan(sumf) && !isinf(sumf));
224
227
  #else
225
228
  for (int i = 0; i < n; ++i) {
226
229
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
163
163
 
164
164
  ax1 = GGML_F32_VEC_LOAD(x + i);
165
165
  ay1 = GGML_F32_VEC_LOAD(y + i);
166
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
166
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
167
167
 
168
168
  GGML_F32_VEC_STORE(y + i, ay1);
169
169
 
170
170
  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
171
171
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
172
- ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
172
+ ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
173
173
 
174
174
  GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
175
175
 
176
176
  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
177
177
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
178
- ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
178
+ ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
179
179
 
180
180
  GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
181
181
 
182
182
  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
183
183
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
184
- ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
184
+ ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
185
185
 
186
186
  GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
187
187
 
188
188
  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
189
189
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
190
- ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
190
+ ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
191
191
 
192
192
  GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
193
193
 
194
194
  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
195
195
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
196
- ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
196
+ ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
197
197
 
198
198
  GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
199
199
 
200
200
  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
201
201
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
202
- ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
202
+ ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
203
203
 
204
204
  GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
205
205
 
206
206
  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
207
207
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
208
- ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
208
+ ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
209
209
 
210
210
  GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
211
211
  }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
215
215
  for (int i = np; i < np2; i += ggml_f32_epr) {
216
216
  ax1 = GGML_F32_VEC_LOAD(x + i);
217
217
  ay1 = GGML_F32_VEC_LOAD(y + i);
218
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
218
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
219
219
 
220
220
  GGML_F32_VEC_STORE(y + i, ay1);
221
221
  }
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
351
351
  #endif
352
352
  }
353
353
 
354
+ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
355
+ #if defined(GGML_USE_ACCELERATE)
356
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
357
+ #elif defined(GGML_SIMD)
358
+ #if defined(__ARM_FEATURE_SVE)
359
+ // scalar ; TODO: Write SVE code
360
+ for (int i = 0; i < n; ++i) {
361
+ y[i] = x[i]*s + b;
362
+ }
363
+ #else
364
+ const int np = (n & ~(GGML_F32_STEP - 1));
365
+
366
+ GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
367
+ GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
368
+
369
+ GGML_F32_VEC ay[GGML_F32_ARR];
370
+
371
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
372
+ for (int j = 0; j < GGML_F32_ARR; j++) {
373
+ ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
374
+ ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
375
+
376
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
377
+ }
378
+ }
379
+
380
+ // leftovers
381
+ for (int i = np; i < n; ++i) {
382
+ y[i] = x[i]*s + b;
383
+ }
384
+ #endif
385
+ #else
386
+ // scalar
387
+ for (int i = 0; i < n; ++i) {
388
+ y[i] = x[i]*s + b;
389
+ }
390
+ #endif
391
+ }
392
+
354
393
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
355
394
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
356
395
  #if defined(GGML_USE_ACCELERATE)
@@ -959,6 +998,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_
959
998
  }
960
999
  }
961
1000
 
1001
+ inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1002
+ for (int i = 0; i < n; ++i) {
1003
+ float xi = x[i];
1004
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1005
+ }
1006
+ }
1007
+
1008
+ inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1009
+ for (int i = 0; i < n; ++i) {
1010
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1011
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1012
+ y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1013
+ }
1014
+ }
1015
+
1016
+ #ifdef GGML_GELU_QUICK_FP16
1017
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1018
+ uint16_t t;
1019
+ for (int i = 0; i < n; ++i) {
1020
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1021
+ memcpy(&t, &fp16, sizeof(uint16_t));
1022
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1023
+ }
1024
+ }
1025
+ #else
1026
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1027
+ for (int i = 0; i < n; ++i) {
1028
+ y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
1029
+ }
1030
+ }
1031
+ #endif
1032
+
1033
+ inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1034
+ const uint16_t * i16 = (const uint16_t *) x;
1035
+ for (int i = 0; i < n; ++i) {
1036
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1037
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
1038
+ }
1039
+ }
1040
+
962
1041
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
963
1042
  #ifndef GGML_USE_ACCELERATE
964
1043
  ggml_float sum = 0.0;
@@ -71,52 +71,13 @@ extern "C" {
71
71
  typedef int32_t llama_seq_id;
72
72
 
73
73
  enum llama_vocab_type {
74
- LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
- LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
- LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
- LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
- LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
- LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
- };
81
-
82
- // pre-tokenization types
83
- enum llama_vocab_pre_type {
84
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
85
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
86
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
87
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
88
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
89
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
90
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
91
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
92
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
93
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
94
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
95
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
96
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
97
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
98
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
99
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
100
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
101
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
102
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
103
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
104
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
105
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
106
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
107
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
108
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
109
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
110
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
111
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
112
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
113
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
114
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
115
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
116
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
117
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
118
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
119
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
74
+ LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
+ LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
+ LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
120
81
  };
121
82
 
122
83
  enum llama_rope_type {
@@ -374,6 +335,9 @@ extern "C" {
374
335
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
375
336
  // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
376
337
  // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
338
+ bool kv_unified; // use a unified buffer across the input sequences when computing the attention
339
+ // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
340
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
377
341
  };
378
342
 
379
343
  // model quantization parameters
@@ -764,7 +728,7 @@ extern "C" {
764
728
  // - lazily on next llama_decode()
765
729
  // p0 < 0 : [0, p1]
766
730
  // p1 < 0 : [p0, inf)
767
- DEPRECATED(void llama_kv_self_seq_div(
731
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
768
732
  struct llama_context * ctx,
769
733
  llama_seq_id seq_id,
770
734
  llama_pos p0,
@@ -1044,6 +1008,7 @@ extern "C" {
1044
1008
  LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
1045
1009
  LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
1046
1010
  LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
1011
+ LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
1047
1012
 
1048
1013
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1049
1014
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -1429,6 +1394,7 @@ extern "C" {
1429
1394
 
1430
1395
  int32_t n_p_eval;
1431
1396
  int32_t n_eval;
1397
+ int32_t n_reused; // number of times a ggml compute graph had been reused
1432
1398
  };
1433
1399
 
1434
1400
  struct llama_perf_sampler_data {