@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp

@@ -5474,7 +5474,7 @@ static void ggml_rope_cache_init(
 }
 
 static void ggml_mrope_cache_init(
-    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
     float * cache, float sin_sign, float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -5509,14 +5509,26 @@ static void ggml_mrope_cache_init(
         }
 
         float theta = theta_t;
-        if (sector >= sections[0] && sector < sec_w) {
-            theta = theta_h;
-        }
-        else if (sector >= sec_w && sector < sec_w + sections[2]) {
-            theta = theta_w;
-        }
-        else if (sector >= sec_w + sections[2]) {
-            theta = theta_e;
+        if (is_imrope) { // qwen3vl apply interleaved mrope
+            if (sector % 3 == 1 && sector < 3 * sections[1]) {
+                theta = theta_h;
+            } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
+                theta = theta_w;
+            } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
+                theta = theta_t;
+            } else {
+                theta = theta_e;
+            }
+        } else {
+            if (sector >= sections[0] && sector < sec_w) {
+                theta = theta_h;
+            }
+            else if (sector >= sec_w && sector < sec_w + sections[2]) {
+                theta = theta_w;
+            }
+            else if (sector >= sec_w + sections[2]) {
+                theta = theta_e;
+            }
         }
 
         rope_yarn(
@@ -5589,6 +5601,7 @@ static void ggml_compute_forward_rope_f32(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
     if (is_mrope) {
@@ -5627,7 +5640,7 @@ static void ggml_compute_forward_rope_f32(
             const int64_t p_w = pos[i2 + ne2 * 2];
             const int64_t p_e = pos[i2 + ne2 * 3];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, p_e, sections, is_vision,
+                p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
@@ -5775,6 +5788,7 @@ static void ggml_compute_forward_rope_f16(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
     const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 
     if (is_mrope) {
@@ -5813,7 +5827,7 @@ static void ggml_compute_forward_rope_f16(
             const int64_t p_w = pos[i2 + ne2 * 2];
             const int64_t p_e = pos[i2 + ne2 * 3];
             ggml_mrope_cache_init(
-                p_t, p_h, p_w, p_e, sections, is_vision,
+                p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
                 freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
         }
 
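Editor's note: the interleaved-mrope (is_imrope) branch added above only changes which rotary section drives a given pair of dimensions ("sector"). A minimal standalone sketch of that mapping follows; it is not part of the diff, and the helper name and section values are made up for illustration.

#include <cstdio>

// Illustrative sketch of the sector -> axis mapping used by the interleaved
// mrope branch above. sections[4] = {t, h, w, e} counts per axis; a "sector"
// is one pair of rotary dimensions. Hypothetical helper, not part of ggml.
static char pick_axis_interleaved(int sector, const int sections[4]) {
    if (sector % 3 == 1 && sector < 3 * sections[1]) return 'h';
    if (sector % 3 == 2 && sector < 3 * sections[2]) return 'w';
    if (sector % 3 == 0 && sector < 3 * sections[0]) return 't';
    return 'e'; // everything past the interleaved region falls back to "extra"
}

int main() {
    const int sections[4] = {24, 20, 20, 0}; // example split, values are illustrative
    for (int sector = 0; sector < 12; ++sector) {
        std::printf("sector %2d -> %c\n", sector, pick_axis_interleaved(sector, sections));
    }
    // prints t h w t h w ... : time/height/width alternate every sector instead
    // of occupying contiguous blocks as in the non-interleaved branch
    return 0;
}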
@@ -7070,7 +7084,11 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
     const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
 
 #ifdef GGML_SIMD
-    const int64_t pkg_size = GGML_F32_EPR;
+    #if defined(__ARM_FEATURE_SVE)
+        const int64_t pkg_size = svcntw();
+    #else
+        const int64_t pkg_size = GGML_F32_EPR;
+    #endif
     const int64_t pkg_count = c / pkg_size;
     const int64_t c_pkg_end = pkg_count * pkg_size;
 #else
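The hunk above sizes the SIMD packet from the runtime SVE vector length instead of a compile-time constant. A minimal sketch of that query, assuming an AArch64 toolchain with SVE enabled; the fallback value is a stand-in, not the real GGML_F32_EPR definition:

// Sketch only: on SVE hardware the number of 32-bit lanes is not fixed at
// compile time, so it has to be queried at run time.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
static long f32_lanes(void) {
    return (long) svcntw(); // 32-bit elements per SVE vector (4 on 128-bit HW, 16 on 512-bit HW)
}
#else
static long f32_lanes(void) {
    return 8;               // stand-in for a fixed-width fallback such as GGML_F32_EPR
}
#endif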
@@ -7493,10 +7511,17 @@ static void ggml_compute_forward_upscale_f32(
     float sf1 = (float)ne1/src0->ne[1];
     float sf2 = (float)ne2/src0->ne[2];
     float sf3 = (float)ne3/src0->ne[3];
+    float pixel_offset = 0.5f;
 
     const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
     const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
 
+    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        pixel_offset = 0.0f;
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
+    }
+
     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
@@ -7516,13 +7541,6 @@ static void ggml_compute_forward_upscale_f32(
             }
         }
     } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-        float pixel_offset = 0.5f;
-        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
-            pixel_offset = 0.0f;
-            sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
-            sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
-        }
-
         for (int64_t i3 = 0; i3 < ne3; i3++) {
             const int64_t i03 = i3 / sf3;
             for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
@@ -7557,6 +7575,51 @@ static void ggml_compute_forward_upscale_f32(
 
                     const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
 
+                    float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                    *y_dst = val;
+                }
+            }
+        }
+    }
+    } else if (mode == GGML_SCALE_MODE_BICUBIC) {
+        // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+        const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
+        auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
+        auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
+        auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
+            const float w0 = weight2(x + 1);
+            const float w1 = weight1(x + 0);
+            const float w2 = weight1(1 - x);
+            const float w3 = weight2(2 - x);
+            return p0*w0 + p1*w1 + p2*w2 + p3*w3;
+        };
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+                    const int64_t y0 = (int64_t)floorf(y);
+                    const float dy = y - (float)y0;
+
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+                        const int64_t x0 = (int64_t)floorf(x);
+                        const float dx = x - (float)x0;
+
+                        auto p = [=](int64_t x_off, int64_t y_off) -> float {
+                            int64_t i00 = std::max(int64_t(0), std::min(x0 + x_off, ne00 - 1));
+                            int64_t i01 = std::max(int64_t(0), std::min(y0 + y_off, ne01 - 1));
+                            return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        };
+
+                        const float val = bicubic(
+                            bicubic(p(-1,-1), p(0,-1), p(1,-1), p(2,-1), dx),
+                            bicubic(p(-1, 0), p(0, 0), p(1, 0), p(2, 0), dx),
+                            bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx),
+                            bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy);
+
                         float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
                         *y_dst = val;
                     }
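As a sanity check on the bicubic convolution weights introduced above (alpha = -0.75, matching PyTorch), here is a standalone sketch showing that the four taps always sum to 1, so interpolating a constant image returns that constant. It is illustrative only and independent of ggml; the hoisted align-corners logic earlier in the hunk simply switches the scale factors to (n_out - 1)/(n_in - 1) so the first and last samples land exactly on the corners.

#include <cstdio>

// Same weight polynomials as the diff above, alpha = -0.75.
static float weight1(float x) { const float a = -0.75f; return ((a + 2) * x - (a + 3)) * x * x + 1; }
static float weight2(float x) { const float a = -0.75f; return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; }

int main() {
    for (float dx = 0.0f; dx <= 1.0f; dx += 0.25f) {
        const float w0 = weight2(dx + 1);
        const float w1 = weight1(dx + 0);
        const float w2 = weight1(1 - dx);
        const float w3 = weight2(2 - dx);
        // taps straddle the sample point: p(-1), p(0), p(1), p(2); the weights sum to 1
        std::printf("dx=%.2f  w=%+.4f %+.4f %+.4f %+.4f  sum=%.4f\n",
                    dx, w0, w1, w2, w3, w0 + w1 + w2 + w3);
    }
    return 0;
}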
@@ -7909,10 +7972,10 @@ void ggml_compute_forward_argsort(
 
 // ggml_compute_forward_flash_attn_ext
 
-static void ggml_compute_forward_flash_attn_ext_f16(
+static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
         const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
+        ggml_tensor * dst,
+        int ir0, int ir1) {
     const ggml_tensor * q = dst->src[0];
     const ggml_tensor * k = dst->src[1];
     const ggml_tensor * v = dst->src[2];
@@ -7928,9 +7991,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
     GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
 
-    const int ith = params->ith;
-    const int nth = params->nth;
-
     const int64_t DK = nek0;
     const int64_t DV = nev0;
     const int64_t N  = neq1;
@@ -7964,16 +8024,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
 
     // parallelize by q rows using ggml_vec_dot_f32
 
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
     float scale         = 1.0f;
     float max_bias      = 0.0f;
     float logit_softcap = 0.0f;
@@ -8000,6 +8050,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT((                            q_to_vec_dot) && "fattn: unsupported K-type");
     GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float  ) && "fattn: unsupported V-type");
 
+    int ith = params->ith;
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
@@ -8147,6 +8199,91 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     }
 }
 
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * q = dst->src[0];
+    const ggml_tensor * k = dst->src[1];
+    const ggml_tensor * v = dst->src[2];
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int64_t DK = nek0;
+    const int64_t DV = nev0;
+    const int64_t N  = neq1;
+
+    GGML_ASSERT(ne0 == DV);
+    GGML_ASSERT(ne2 == N);
+
+    // input tensor rows must be contiguous
+    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
+    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
+    GGML_ASSERT(nbv0 == ggml_type_size(v->type));
+
+    GGML_ASSERT(neq0 == DK);
+    GGML_ASSERT(nek0 == DK);
+    GGML_ASSERT(nev0 == DV);
+
+    GGML_ASSERT(neq1 == N);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int64_t nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // disable for NUMA
+    const bool disable_chunking = ggml_is_numa();
+
+    // 4x chunks per thread
+    int nth_scaled = nth * 4;
+    int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+    int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+    if (nth == 1 || nchunk < nth || disable_chunking) {
+        nchunk = nth;
+    }
+
+    if (ith == 0) {
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+        ggml_threadpool_chunk_set(params->threadpool, nth);
+    }
+
+    ggml_barrier(params->threadpool);
+
+    // The number of elements in each chunk
+    const int64_t dr = (nr + nchunk - 1) / nchunk;
+
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;
+
+    while (current_chunk < nchunk) {
+        const int64_t ir0 = dr * current_chunk;
+        const int64_t ir1 = MIN(ir0 + dr, nr);
+
+        ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+
+        current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+    }
+}
+
 void ggml_compute_forward_flash_attn_ext(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
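The refactor above splits flash attention into a _one_chunk worker plus an outer dispatcher that hands out row ranges through a shared counter (ggml_threadpool_chunk_set / ggml_threadpool_chunk_add). A minimal standalone sketch of the same work-stealing pattern with a plain atomic counter follows; it illustrates the scheduling idea only and does not use the ggml threadpool API.

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int     nth    = 4;        // worker threads
    const int64_t nr     = 1000;     // total rows of work (illustrative)
    const int64_t nchunk = nth * 4;  // roughly 4x chunks per thread, as in the diff
    const int64_t dr     = (nr + nchunk - 1) / nchunk;

    // The first unprocessed chunk is nth: chunks 0..nth-1 are taken implicitly by thread ids.
    std::atomic<int64_t> next_chunk{nth};

    std::vector<std::thread> workers;
    for (int ith = 0; ith < nth; ++ith) {
        workers.emplace_back([&, ith] {
            int64_t chunk = ith;                  // start with our own chunk ...
            while (chunk < nchunk) {
                const int64_t ir0 = dr * chunk;
                const int64_t ir1 = std::min(ir0 + dr, nr);
                std::printf("thread %d rows [%lld, %lld)\n", ith, (long long) ir0, (long long) ir1);
                chunk = next_chunk.fetch_add(1);  // ... then steal the next free one
            }
        });
    }
    for (auto & w : workers) w.join();
    return 0;
}

Compared with the old static split (dr rows per thread), this keeps all threads busy when some chunks finish early, which is why the per-thread row range moved out of the worker and into the dispatcher.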
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -1600,6 +1600,32 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         return false;
     }
 
+    void forward_mul_mat_one_chunk(ggml_compute_params * params, ggml_tensor * op, int64_t src0_start, int64_t src0_end) {
+        const ggml_tensor * src0 = op->src[0];
+        const ggml_tensor * src1 = op->src[1];
+        ggml_tensor *       dst  = op;
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const void * src1_wdata      = params->wdata;
+        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (ne11 > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
+                    (float *) ((char *) dst->data) + src0_start, ne01,
+                    (const char *) src0->data + src0_start * nb01,
+                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
+                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+                    (const char *) src0->data + src0_start * nb01,
+                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                    src0_end - src0_start);
+        }
+    }
+
     void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
         const ggml_tensor * src0 = op->src[0];
         const ggml_tensor * src1 = op->src[1];
@@ -1643,31 +1669,62 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
         }
 
-        ggml_barrier(params->threadpool);
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
 
-        const void * src1_wdata      = params->wdata;
-        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
-        if (src0_start >= src0_end) {
-            return;
+        // 4x chunks per thread
+        int64_t nr = ggml_nrows(op->src[0]);
+        int nth_scaled = nth * 4;
+        int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+        int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
+
+        // Ensure minimum chunk size to avoid alignment issues with high thread counts
+        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
+        const int64_t min_chunk_size = NB_COLS;
+        if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
+            nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
         }
 
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-        if (ne11 > 3) {
-            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        if (nth == 1 || nchunk < nth || disable_chunking) {
+            nchunk = nth;
         }
-        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
-            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
-                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                    (const char *) src0->data + src0_start * nb01,
-                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                    src0_end - src0_start);
+
+        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
+        // This prevents creating too many tiny chunks that could overlap after alignment
+        const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
+        if (nchunk > max_nchunk) {
+            nchunk = max_nchunk;
+        }
+
+        if (ith == 0) {
+            // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+            ggml_threadpool_chunk_set(params->threadpool, nth);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        // The first chunk comes from our thread_id, the rest will get auto-assigned.
+        int current_chunk = ith;
+
+        while (current_chunk < nchunk) {
+            int64_t src0_start = (current_chunk * ne01) / nchunk;
+            int64_t src0_end   = ((current_chunk + 1) * ne01) / nchunk;
+
+            // Align boundaries to NB_COLS - round up to ensure all data is included
+            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
+            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+            src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
+            if (src0_end > ne01) {
+                src0_end = ne01;
+            }
+
+            if (src0_start >= src0_end) {
+                break;
+            }
+
+            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);
+
+            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
        }
     }
 
@@ -1772,8 +1829,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         int64_t src0_cur_start = (ith * ne01) / nth;
         int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;
 
+        // Align boundaries to NB_COLS - round up to ensure all data is included
         src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
         src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;
+        if (src0_cur_end > ne01) {
+            src0_cur_end = ne01;
+        }
 
         if (src0_cur_start >= src0_cur_end) {
             return;
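The repack changes above keep the same NB_COLS round-up of chunk boundaries but add a clamp to ne01 so the last chunk can never run past the row count. A tiny standalone sketch of that boundary arithmetic, with illustrative values only:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne01    = 100;  // rows in src0 (illustrative)
    const int64_t NB_COLS = 8;    // interleaved column block size (illustrative)
    const int64_t nchunk  = 6;

    for (int64_t chunk = 0; chunk < nchunk; ++chunk) {
        int64_t start = (chunk * ne01) / nchunk;
        int64_t end   = ((chunk + 1) * ne01) / nchunk;

        // round both boundaries up to a multiple of NB_COLS, as in the diff
        start = (start % NB_COLS) ? start + NB_COLS - (start % NB_COLS) : start;
        end   = (end   % NB_COLS) ? end   + NB_COLS - (end   % NB_COLS) : end;
        if (end > ne01) end = ne01;   // the new clamp: never run past the last row

        if (start >= end) continue;   // fully absorbed chunk, nothing left to do
        std::printf("chunk %lld -> rows [%lld, %lld)\n", (long long) chunk, (long long) start, (long long) end);
    }
    return 0;
}

With ne01 = 100 and NB_COLS = 8 the rounded chunks become [0,16) [16,40) [40,56) [56,72) [72,88) [88,100): the rows are covered exactly once, and the clamp is what keeps the final boundary at 100 instead of 104.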
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

@@ -956,7 +956,7 @@ do { \
 
 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO     (__m256)__lasx_xvldi(0)
-#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplgr2vr_w((x))
+#define GGML_F32Cx8_SET1(x)  (__m256)__lasx_xvreplfr2vr_s((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     __m256i a;
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 
 #define GGML_F32x4         __m128
 #define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
-#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define GGML_F32x4_ADD     __lsx_vfadd_s
 #define GGML_F32x4_MUL     __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x) \
-{ \
-    int offset = GGML_F32_ARR >> 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    offset >>= 1; \
-    for (int i = 0; i < offset; ++i) { \
-        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
-    } \
-    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
-    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
-    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
-    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+    } \
+    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+    res = (ggml_float) ((v4f32)t5)[0]; \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 
 #define GGML_F32Cx4          __m128
 #define GGML_F32Cx4_ZERO     (__m128)__lsx_vldi(0)
-#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vreplfr2vr_s((x))
 #define GGML_F32Cx4_LOAD(x)  (__m128)__lsx_f16x4_load(x)
 #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define GGML_F32Cx4_FMA      GGML_F32x4_FMA
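The rewritten GGML_F32x4_REDUCE above swaps a shift-based horizontal sum for two even/odd pick-and-add steps. The same reduction in scalar form, just to make the data movement explicit (illustration only, no LSX intrinsics):

#include <cstdio>

int main() {
    float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    // step 1: add even-indexed lanes to odd-indexed lanes -> {v0+v1, v2+v3}
    float partial[2] = {v[0] + v[1], v[2] + v[3]};
    // step 2: repeat on the partial sums -> the full horizontal sum
    float res = partial[0] + partial[1];

    std::printf("%f\n", res); // 10.0
    return 0;
}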
package/src/llama.cpp/include/llama.h

@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_NORM   = 0,
         LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
         LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
         LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
     };
 
@@ -460,7 +461,11 @@ extern "C" {
     LLAMA_API bool llama_supports_gpu_offload(void);
     LLAMA_API bool llama_supports_rpc        (void);
 
+    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
+    // In some cases the requested values via llama_context_params may differ from the actual values used by the context
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
@@ -481,6 +486,7 @@ extern "C" {
 
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -584,7 +590,7 @@ extern "C" {
     LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
 
     // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
+    // NOTE: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
     // Get the invocation tokens if the current lora is an alora
@@ -1110,8 +1116,6 @@ extern "C" {
     //    // sample from the logits of the last token in the batch
     //    const llama_token id = llama_sampler_sample(smpl, ctx, -1);
     //
-    //    // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
-    //    llama_sampler_accept(smpl, id);
     //    ...
     // }
     //
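The header changes above add llama_n_ctx_seq and llama_model_n_embd_inp and note that the values a context actually uses may differ from what was requested in llama_context_params. A minimal usage sketch of querying the effective values after creation; the model path and the requested n_ctx are placeholders and error handling is omitted:

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 8192; // requested; the context may settle on a different value

    llama_context * ctx = llama_init_from_model(model, cparams);

    // query the values the context actually uses, as recommended by the new note
    std::printf("n_ctx     = %u\n", llama_n_ctx(ctx));
    std::printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx));
    std::printf("n_batch   = %u\n", llama_n_batch(ctx));
    std::printf("n_ubatch  = %u\n", llama_n_ubatch(ctx));
    std::printf("n_seq_max = %u\n", llama_n_seq_max(ctx));

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}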
package/src/llama.cpp/src/CMakeLists.txt

@@ -35,6 +35,101 @@ add_library(llama
             unicode-data.cpp
             unicode.cpp
             unicode.h
+            models/apertus.cpp
+            models/arcee.cpp
+            models/arctic.cpp
+            models/arwkv7.cpp
+            models/baichuan.cpp
+            models/bailingmoe.cpp
+            models/bailingmoe2.cpp
+            models/bert.cpp
+            models/bitnet.cpp
+            models/bloom.cpp
+            models/chameleon.cpp
+            models/chatglm.cpp
+            models/codeshell.cpp
+            models/cogvlm.cpp
+            models/cohere2-iswa.cpp
+            models/command-r.cpp
+            models/dbrx.cpp
+            models/deci.cpp
+            models/deepseek.cpp
+            models/deepseek2.cpp
+            models/dots1.cpp
+            models/dream.cpp
+            models/ernie4-5-moe.cpp
+            models/ernie4-5.cpp
+            models/exaone.cpp
+            models/exaone4.cpp
+            models/falcon-h1.cpp
+            models/falcon.cpp
+            models/gemma-embedding.cpp
+            models/gemma.cpp
+            models/gemma2-iswa.cpp
+            models/gemma3-iswa.cpp
+            models/gemma3n-iswa.cpp
+            models/glm4-moe.cpp
+            models/glm4.cpp
+            models/gpt2.cpp
+            models/gptneox.cpp
+            models/granite-hybrid.cpp
+            models/granite.cpp
+            models/grok.cpp
+            models/grovemoe.cpp
+            models/hunyuan-dense.cpp
+            models/hunyuan-moe.cpp
+            models/internlm2.cpp
+            models/jais.cpp
+            models/jamba.cpp
+            models/lfm2.cpp
+            models/llada-moe.cpp
+            models/llada.cpp
+            models/llama-iswa.cpp
+            models/llama.cpp
+            models/mamba.cpp
+            models/minicpm3.cpp
+            models/minimax-m2.cpp
+            models/mpt.cpp
+            models/nemotron-h.cpp
+            models/nemotron.cpp
+            models/neo-bert.cpp
+            models/olmo.cpp
+            models/olmo2.cpp
+            models/olmoe.cpp
+            models/openai-moe-iswa.cpp
+            models/openelm.cpp
+            models/orion.cpp
+            models/pangu-embedded.cpp
+            models/phi2.cpp
+            models/phi3.cpp
+            models/plamo.cpp
+            models/plamo2.cpp
+            models/plm.cpp
+            models/qwen.cpp
+            models/qwen2.cpp
+            models/qwen2moe.cpp
+            models/qwen2vl.cpp
+            models/qwen3.cpp
+            models/qwen3vl.cpp
+            models/qwen3vl-moe.cpp
+            models/qwen3moe.cpp
+            models/refact.cpp
+            models/rwkv6-base.cpp
+            models/rwkv6.cpp
+            models/rwkv6qwen2.cpp
+            models/rwkv7-base.cpp
+            models/rwkv7.cpp
+            models/seed-oss.cpp
+            models/smallthinker.cpp
+            models/smollm3.cpp
+            models/stablelm.cpp
+            models/starcoder.cpp
+            models/starcoder2.cpp
+            models/t5-dec.cpp
+            models/t5-enc.cpp
+            models/wavtokenizer-dec.cpp
+            models/xverse.cpp
+            models/graph-context-mamba.cpp
             )
 
 target_include_directories(llama PRIVATE .)