@fugood/llama.node 1.0.2 → 1.0.4

This diff covers the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (50)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-batch.cpp

@@ -27,6 +27,7 @@ bool llama_batch_allocr::init(
         const llama_vocab & vocab,
         const llama_memory_i * memory,
         uint32_t n_embd,
+        uint32_t n_seq_max,
         bool output_all) {
     clear();
 
@@ -40,6 +41,11 @@ bool llama_batch_allocr::init(
     // validate input batch
     //
 
+    if (n_seq_max > LLAMA_MAX_SEQ) {
+        LLAMA_LOG_ERROR("%s: n_seq_max = %d > %d\n", __func__, n_seq_max, LLAMA_MAX_SEQ);
+        return false;
+    }
+
     if (batch.token) {
         for (int32_t i = 0; i < batch.n_tokens; ++i) {
             if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
@@ -52,8 +58,8 @@ bool llama_batch_allocr::init(
     if (batch.seq_id) {
         for (int32_t i = 0; i < batch.n_tokens; ++i) {
             for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
-                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
-                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
+                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
+                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                     return false;
                 }
             }
@@ -86,7 +92,7 @@ bool llama_batch_allocr::init(
 
     // initialize the starting position for each sequence based on the positions in the memory
     llama_pos p0[LLAMA_MAX_SEQ];
-    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+    for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (!memory) {
             // if no memory -> start from 0
             p0[s] = 0;
@@ -143,13 +149,16 @@ bool llama_batch_allocr::init(
     // compute stats
     //
 
-    this->n_embd = n_embd;
+    this->n_embd    = n_embd;
+    this->n_seq_max = n_seq_max;
 
     // count the outputs in this batch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
        n_outputs += batch.logits[i] != 0;
     }
 
+    has_cpl = false;
+
     // determine coupled sequences
     // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
@@ -166,6 +175,8 @@ bool llama_batch_allocr::init(
 
                 // note: tracking the other way around is not necessary for now
                 //seq_cpl[s0][s1] = true;
+
+                has_cpl = true;
             }
         }
     }
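
The has_cpl flag added above caches whether the batch contains coupled sequences, i.e. pairs of sequence ids that share at least one token. A toy standalone sketch of that detection (illustrative data and names, not the llama.cpp API):

    #include <cstdio>
    #include <vector>

    int main() {
        // seq ids attached to each token in a hypothetical batch
        const std::vector<std::vector<int>> token_seq_ids = { {0}, {0, 1}, {2} };

        // a token assigned to more than one sequence couples those sequences
        bool has_cpl = false;
        for (const auto & ids : token_seq_ids) {
            has_cpl = has_cpl || ids.size() > 1;
        }

        std::printf("has_cpl = %d\n", has_cpl); // token {0, 1} couples sequences 0 and 1 -> prints 1
        return 0;
    }
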
@@ -187,7 +198,7 @@ bool llama_batch_allocr::init(
         seq_set_map[cur].push_back(i);
     }
 
-    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+    for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             seq_idx[s] = seq_id_unq.size();
             seq_id_unq.push_back(s);
@@ -199,7 +210,7 @@ bool llama_batch_allocr::init(
         LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
 
         llama_ubatch ubatch {
-            /*.equal_seqs   =*/ false,
+            /*.b_equal_seqs =*/ false,
             /*.n_tokens     =*/ (uint32_t) batch.n_tokens,
             /*.n_seq_tokens =*/ (uint32_t) 1,
             /*.n_seqs       =*/ (uint32_t) batch.n_tokens,
@@ -212,6 +223,7 @@ bool llama_batch_allocr::init(
             /*.seq_id_unq   =*/ this->seq_id_unq.data(),
             /*.seq_idx      =*/ this->seq_idx.data(),
             /*.output       =*/ batch.logits,
+            /*.data         =*/ {},
         };
 
         ubatch_print(ubatch, debug);
@@ -239,7 +251,7 @@ bool llama_batch_allocr::init(
     // consistency checks
     //
 
-    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+    for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_pos[s].empty()) {
             continue;
         }
@@ -282,8 +294,8 @@ bool llama_batch_allocr::init(
     }
 
     if (memory) {
-        for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
-            for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
+        for (uint32_t s0 = 0; s0 < n_seq_max; ++s0) {
+            for (uint32_t s1 = 0; s1 < n_seq_max; ++s1) {
                 if (seq_cpl[s0][s1]) {
                     if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
                         memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
@@ -314,12 +326,12 @@ bool llama_batch_allocr::init(
     //
     {
         seq_set_t cur_seq_set[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
             cur_seq_set[s].set();
         }
 
         llama_pos cur_seq_pos[LLAMA_MAX_SEQ];
-        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
             cur_seq_pos[s] = -1;
         }
 
@@ -355,39 +367,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
     clear();
     split_reset();
 
-    ubatches.emplace_back();
-
-    auto & ubatch = ubatches.back();
+    auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    ubatch.token     .resize(n_tokens);
-    ubatch.embd      .clear();
-    ubatch.pos       .resize(n_tokens);
-    ubatch.n_seq_id  .resize(n_tokens);
-    ubatch.seq_id    .resize(n_tokens);
-    ubatch.seq_id_unq.resize(0);
-    ubatch.seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    ubatch.output    .resize(n_tokens);
+    udata->token     .resize(n_tokens);
+    udata->embd      .clear();
+    udata->pos       .resize(n_tokens);
+    udata->n_seq_id  .resize(n_tokens);
+    udata->seq_id    .resize(n_tokens);
+    udata->seq_id_unq.resize(0);
+    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
+    udata->output    .resize(n_tokens);
 
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        ubatch.seq_idx[s] = s;
-        ubatch.seq_id_unq.push_back(s);
+        udata->seq_idx[s] = s;
+        udata->seq_id_unq.push_back(s);
     }
 
     llama_ubatch res {
-        /*.equal_seqs   =*/ true,
+        /*.b_equal_seqs =*/ true,
         /*.n_tokens     =*/ n_tokens,
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ n_seqs,
 
-        /*.token        =*/ ubatch.token.data(),
+        /*.token        =*/ udata->token.data(),
         /*.embd         =*/ nullptr,
-        /*.pos          =*/ ubatch.pos.data(),
-        /*.n_seq_id     =*/ ubatch.n_seq_id.data(),
-        /*.seq_id       =*/ ubatch.seq_id.data(),
-        /*.seq_id_unq   =*/ ubatch.seq_id_unq.data(),
-        /*.seq_idx      =*/ ubatch.seq_idx.data(),
-        /*.output       =*/ ubatch.output.data(),
+        /*.pos          =*/ udata->pos.data(),
+        /*.n_seq_id     =*/ udata->n_seq_id.data(),
+        /*.seq_id       =*/ udata->seq_id.data(),
+        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
+        /*.seq_idx      =*/ udata->seq_idx.data(),
+        /*.output       =*/ udata->output.data(),
+        /*.data         =*/ std::move(udata),
     };
 
     return res;
@@ -405,6 +416,10 @@ uint32_t llama_batch_allocr::get_n_outputs() const {
     return n_outputs;
 }
 
+uint32_t llama_batch_allocr::get_n_used() const {
+    return n_used;
+}
+
 std::vector<int32_t> & llama_batch_allocr::get_out_ids() {
     return out_ids;
 }
@@ -420,10 +435,10 @@ llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
 void llama_batch_allocr::split_reset() {
     out_ids.clear();
 
+    n_used = 0;
+
     used.clear();
     used.resize(get_n_tokens(), false);
-
-    ubatches.clear();
 }
 
 llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
@@ -444,6 +459,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
         idxs.push_back(cur_idx);
 
         used[cur_idx] = true;
+        ++n_used;
 
         ++cur_idx;
 
@@ -459,9 +475,17 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
     return ubatch_add(idxs, idxs.size(), false);
 }
 
-llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
+llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
+    if (sequential && has_cpl) {
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+
+        return {};
+    }
+
     std::vector<seq_set_t> cur_seq_set;
 
+    llama_seq_id last_seq_id = -1;
+
     // determine the non-overlapping sequence sets participating in this ubatch
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         if (used[i]) {
@@ -478,9 +502,16 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
             }
         }
 
+        // accept only increasing sequence ids
+        if (sequential) {
+            add = add && (cur_seq_set.empty() || batch.seq_id[i][0] == last_seq_id + 1);
+        }
+
         if (add) {
             cur_seq_set.push_back(seq_set[i]);
 
+            last_seq_id = batch.seq_id[i][0];
+
             if (cur_seq_set.size() > n_ubatch) {
                 break;
             }
@@ -529,6 +560,7 @@ llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch) {
            idxs_per_seq[s].push_back(idx);
 
            used[idx] = true;
+           ++n_used;
 
            ++cur_idx[s];
        }
@@ -570,6 +602,7 @@ llama_ubatch llama_batch_allocr::split_seq(uint32_t n_ubatch) {
        idxs.push_back(cur_idx);
 
        used[cur_idx] = true;
+       ++n_used;
 
        if (idxs.size() >= n_ubatch) {
            break;
@@ -620,78 +653,77 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     assert(n_tokens%n_seqs == 0);
 
-    ubatches.emplace_back();
-
-    auto & ubatch = ubatches.back();
+    auto udata = std::make_shared<llama_ubatch::data_t>();
 
     const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
 
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  =              (int64_t) n_tokens*n_pos_cur;
 
-    ubatch.token     .resize(n_tokens);
-    ubatch.embd      .resize(n_embd_all);
-    ubatch.pos       .resize(n_pos_all);
-    ubatch.n_seq_id  .resize(n_tokens);
-    ubatch.seq_id    .resize(n_tokens);
-    ubatch.seq_id_unq.resize(0);
-    ubatch.seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    ubatch.output    .resize(n_tokens);
+    udata->token     .resize(n_tokens);
+    udata->embd      .resize(n_embd_all);
+    udata->pos       .resize(n_pos_all);
+    udata->n_seq_id  .resize(n_tokens);
+    udata->seq_id    .resize(n_tokens);
+    udata->seq_id_unq.resize(0);
+    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
+    udata->output    .resize(n_tokens);
 
     seq_set_t seq_set_unq;
 
     for (size_t i = 0; i < idxs.size(); ++i) {
         if (batch.token) {
-            ubatch.token[i] = batch.token[idxs[i]];
+            udata->token[i] = batch.token[idxs[i]];
         }
 
         if (batch.embd) {
-            memcpy(ubatch.embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
+            memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
         }
 
         for (int j = 0; j < n_pos_cur; ++j) {
-            ubatch.pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
+            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
         }
 
-        ubatch.n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        ubatch.seq_id[i]   = batch.seq_id[idxs[i]];
-        ubatch.output[i]   = batch.logits[idxs[i]];
+        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
+        udata->seq_id[i]   = batch.seq_id[idxs[i]];
+        udata->output[i]   = batch.logits[idxs[i]];
 
-        for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
-            seq_set_unq.set(ubatch.seq_id[i][s]);
+        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
+            seq_set_unq.set(udata->seq_id[i][s]);
        }
 
-        if (ubatch.output[i]) {
+        if (udata->output[i]) {
            out_ids.push_back(idxs[i]);
        }
    }
 
-    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
-            ubatch.seq_idx[s] = ubatch.seq_id_unq.size();
-            ubatch.seq_id_unq.push_back(s);
+            udata->seq_idx[s] = udata->seq_id_unq.size();
+            udata->seq_id_unq.push_back(s);
        }
    }
 
    llama_ubatch res {
-        /*.equal_seqs   =*/ equal_seqs,
+        /*.b_equal_seqs =*/ equal_seqs,
        /*.n_tokens     =*/ n_tokens,
        /*.n_seq_tokens =*/ n_tokens/n_seqs,
        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ (uint32_t) ubatch.seq_id_unq.size(),
-
-        /*.token        =*/ batch.token ? ubatch.token.data() : nullptr,
-        /*.embd         =*/ batch.embd ? ubatch.embd.data() : nullptr,
-        /*.pos          =*/ ubatch.pos.data(),
-        /*.n_seq_id     =*/ ubatch.n_seq_id.data(),
-        /*.seq_id       =*/ ubatch.seq_id.data(),
-        /*.seq_id_unq   =*/ ubatch.seq_id_unq.data(),
-        /*.seq_idx      =*/ ubatch.seq_idx.data(),
-        /*.output       =*/ ubatch.output.data(),
+        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
+
+        /*.token        =*/ batch.token ? udata->token.data() : nullptr,
+        /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
+        /*.pos          =*/ udata->pos.data(),
+        /*.n_seq_id     =*/ udata->n_seq_id.data(),
+        /*.seq_id       =*/ udata->seq_id.data(),
+        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
+        /*.seq_idx      =*/ udata->seq_idx.data(),
+        /*.output       =*/ udata->output.data(),
+        /*.data         =*/ std::move(udata),
    };
 
    if (debug > 0) {
-        LLAMA_LOG_DEBUG("%s: added ubatch %d to split:\n", __func__, (int) ubatches.size() - 1);
+        LLAMA_LOG_DEBUG("%s: added ubatch to split:\n", __func__);
 
        ubatch_print(res, debug);
    }
@@ -701,7 +733,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
 void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
     if (debug > 0) {
-        LLAMA_LOG_DEBUG("%s: equal_seqs   = %d\n", __func__, ubatch.equal_seqs);
+        LLAMA_LOG_DEBUG("%s: equal_seqs   = %d\n", __func__, ubatch.equal_seqs());
         LLAMA_LOG_DEBUG("%s: n_tokens     = %d\n", __func__, ubatch.n_tokens);
         LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d\n", __func__, ubatch.n_seq_tokens);
         LLAMA_LOG_DEBUG("%s: n_seqs       = %d\n", __func__, ubatch.n_seqs);
package/src/llama.cpp/src/llama-batch.h

@@ -8,12 +8,17 @@
 #include <vector>
 #include <set>
 #include <bitset>
+#include <memory>
 #include <unordered_map>
 
 // keep this struct lightweight
-// it points to data in `llama_batch_allocr`
 struct llama_ubatch {
-    bool equal_seqs;
+    bool equal_seqs() const {
+        return b_equal_seqs != 0;
+    }
+
+    uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
+                           //       otherwise address sanitizer complains
     // TODO: whole_seqs for embeddings?
 
     uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
@@ -34,6 +39,20 @@ struct llama_ubatch {
     llama_seq_id *  seq_id_unq; // [n_seqs_unq]    | s | seq_id
     int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ] | - | seq_idx
     int8_t       *  output;     // [n_tokens]      | i | -
+
+    struct data_t {
+        std::vector<llama_token>    token;
+        std::vector<float>          embd;
+        std::vector<llama_pos>      pos;
+        std::vector<int32_t>        n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id>   seq_id_unq;
+        std::vector<int32_t>        seq_idx;
+        std::vector<int8_t>         output;
+    };
+
+    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    std::shared_ptr<data_t> data;
 };
 
 // a helper for sanitizing, fulfilling and splitting a batch
@@ -48,12 +67,14 @@ public:
             const llama_vocab & vocab,
             const llama_memory_i * memory,
             uint32_t n_embd,
+            uint32_t n_seq_max,
             bool output_all);
 
     const llama_batch & get_batch() const;
 
     uint32_t get_n_tokens()  const;
     uint32_t get_n_outputs() const;
+    uint32_t get_n_used() const;
 
     // the array of output indices in the order they were encountered during the ubatch splitting
     std::vector<int32_t> & get_out_ids();
@@ -69,7 +90,8 @@ public:
     llama_ubatch split_simple(uint32_t n_ubatch);
 
     // make ubatches of equal-length sequences sets
-    llama_ubatch split_equal(uint32_t n_ubatch);
+    // if sequential == true, the tokens in the ubatch will have increasing sequential sequence ids
+    llama_ubatch split_equal(uint32_t n_ubatch, bool sequential);
 
     // sequence-set-wise split - each ubatch contains a single sequence-set
     llama_ubatch split_seq(uint32_t n_ubatch);
@@ -98,6 +120,7 @@ private:
     const uint32_t n_pos_per_embd;
 
     uint32_t n_embd;
+    uint32_t n_seq_max;
     uint32_t n_outputs;
 
     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
@@ -112,6 +135,9 @@ private:
     using pos_set_t = std::set<llama_pos>;
     using seq_cpl_t = std::vector<bool>;
 
+    // helper flag to quickly determine if there are any coupled sequences in the batch
+    bool has_cpl = false;
+
     std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
     std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
 
@@ -125,23 +151,10 @@ private:
     // batch indices of the output
     std::vector<int32_t> out_ids;
 
+    uint32_t n_used;
+
     // used[i] indicates if token i has already been used in a previous ubatch
     std::vector<bool> used;
 
-    // llama_ubatch points to this data:
-    struct ubatch {
-        std::vector<llama_token>    token;
-        std::vector<float>          embd;
-        std::vector<llama_pos>      pos;
-        std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
-        std::vector<llama_seq_id>   seq_id_unq;
-        std::vector<int32_t>        seq_idx;
-        std::vector<int8_t>         output;
-    };
-
-    // current splitting state:
-    std::vector<ubatch> ubatches;
-
     int debug;
 };
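
The header-side effect is that llama_ubatch becomes self-owning: its raw pointers can reference the shared data_t block, so a ubatch no longer dangles once llama_batch_allocr::split_reset() drops the splitting state (previously the std::vector<ubatch> ubatches member owned the storage). A minimal sketch of this view-plus-owner pattern, with illustrative names:

    #include <cstdio>
    #include <memory>
    #include <utility>
    #include <vector>

    struct view_t {
        const int * tokens;                      // non-owning pointer, like llama_ubatch::token
        std::shared_ptr<std::vector<int>> data;  // owner, like llama_ubatch::data (may be null)
    };

    view_t make_view() {
        auto data = std::make_shared<std::vector<int>>(std::vector<int>{1, 2, 3});
        const int * p = data->data();
        return view_t{ p, std::move(data) };  // the pointer stays valid: the view owns the storage
    }

    int main() {
        view_t v = make_view();  // storage outlives the factory scope that allocated it
        std::printf("%d %d %d\n", v.tokens[0], v.tokens[1], v.tokens[2]);
        return 0;
    }
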
package/src/llama.cpp/src/llama-chat.cpp

@@ -56,6 +56,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+    { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
     { "granite", LLM_CHAT_TEMPLATE_GRANITE },
     { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
@@ -64,6 +65,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "bailing", LLM_CHAT_TEMPLATE_BAILING },
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
+    { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -166,10 +169,13 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
         return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
     } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
+        if (tmpl_contains("[|tool|]")) {
+            return LLM_CHAT_TEMPLATE_EXAONE_4;
+        }
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
         // EXAONE-3.0-7.8B-Instruct
         return LLM_CHAT_TEMPLATE_EXAONE_3;
-    } else if (tmpl_contains("rwkv-world")) {
+    } else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
         return LLM_CHAT_TEMPLATE_RWKV_WORLD;
     } else if (tmpl_contains("<|start_of_role|>")) {
         return LLM_CHAT_TEMPLATE_GRANITE;
@@ -185,6 +191,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
+    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
+        return LLM_CHAT_TEMPLATE_KIMI_K2;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -526,6 +536,22 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "[|assistant|]";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "user") {
+                ss << "[|user|]" << trim(message->content) << "\n";
+            } else if (role == "assistant") {
+                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "tool") {
+                ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n";
+            }
+        }
+        if (add_ass) {
+            ss << "[|assistant|]";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
         // this template requires the model to have "\n\n" as EOT token
         for (size_t i = 0; i < chat.size(); i++) {
@@ -665,6 +691,38 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|response|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
+        // tencent/Hunyuan-A13B-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
+            } else if (role == "assistant") {
+                ss << "<|startoftext|>" << message->content << "<|eos|>";
+            } else {
+                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
+        // moonshotai/Kimi-K2-Instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|im_system|>system<|im_middle|>";
+            } else if (role == "user") {
+                ss << "<|im_user|>user<|im_middle|>";
+            } else if (role == "assistant") {
+                ss << "<|im_assistant|>assistant<|im_middle|>";
+            } else if (role == "tool") {
+                ss << "<|im_system|>tool<|im_middle|>";
+            }
+
+            ss << message->content << "<|im_end|>";
+
+            if (add_ass) {
+                ss << "<|im_assistant|>assistant<|im_middle|>";
+            }
+        }
     } else {
         // template not supported
         return -1;
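
For reference, here is a standalone rendering of the Kimi-K2 turn format that the new LLM_CHAT_TEMPLATE_KIMI_K2 branch emits, simplified to role/content pairs. Note that the hunk above appends the add_ass suffix inside the message loop; this sketch appends it once after the final message, matching how the other templates in this file behave:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    std::string render_kimi_k2(const std::vector<std::pair<std::string, std::string>> & chat,
                               bool add_ass) {
        std::ostringstream ss;
        for (const auto & [role, content] : chat) {
            if (role == "system") {
                ss << "<|im_system|>system<|im_middle|>";
            } else if (role == "user") {
                ss << "<|im_user|>user<|im_middle|>";
            } else if (role == "assistant") {
                ss << "<|im_assistant|>assistant<|im_middle|>";
            } else if (role == "tool") {
                ss << "<|im_system|>tool<|im_middle|>";
            }
            ss << content << "<|im_end|>";
        }
        if (add_ass) {
            ss << "<|im_assistant|>assistant<|im_middle|>"; // prompt the next assistant turn
        }
        return ss.str();
    }

    int main() {
        // prints: <|im_system|>system<|im_middle|>You are helpful.<|im_end|>
        //         <|im_user|>user<|im_middle|>Hi!<|im_end|><|im_assistant|>assistant<|im_middle|>
        std::cout << render_kimi_k2({ {"system", "You are helpful."},
                                      {"user",   "Hi!"} }, /*add_ass=*/true) << "\n";
        return 0;
    }
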
package/src/llama.cpp/src/llama-chat.h

@@ -35,6 +35,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_EXAONE_4,
     LLM_CHAT_TEMPLATE_RWKV_WORLD,
     LLM_CHAT_TEMPLATE_GRANITE,
     LLM_CHAT_TEMPLATE_GIGACHAT,
@@ -44,6 +45,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
+    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };