cui-llama.rn 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +28 -44
  5. package/cpp/common.h +35 -14
  6. package/cpp/ggml-alloc.c +0 -1
  7. package/cpp/ggml-backend-impl.h +38 -20
  8. package/cpp/ggml-backend-reg.cpp +246 -92
  9. package/cpp/ggml-backend.h +1 -0
  10. package/cpp/ggml-common.h +42 -48
  11. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
  12. package/cpp/ggml-cpu-aarch64.h +2 -26
  13. package/cpp/ggml-cpu-traits.cpp +36 -0
  14. package/cpp/ggml-cpu-traits.h +38 -0
  15. package/cpp/ggml-cpu.c +14122 -13971
  16. package/cpp/ggml-cpu.cpp +627 -715
  17. package/cpp/ggml-cpu.h +0 -17
  18. package/cpp/ggml-impl.h +22 -6
  19. package/cpp/ggml-metal.m +482 -24
  20. package/cpp/ggml-quants.c +0 -9
  21. package/cpp/ggml-threading.h +4 -2
  22. package/cpp/ggml.c +284 -178
  23. package/cpp/ggml.h +73 -25
  24. package/cpp/llama-grammar.cpp +15 -15
  25. package/cpp/llama-grammar.h +2 -5
  26. package/cpp/llama-sampling.cpp +35 -90
  27. package/cpp/llama-vocab.cpp +7 -2
  28. package/cpp/llama-vocab.h +1 -1
  29. package/cpp/llama.cpp +1782 -586
  30. package/cpp/llama.h +20 -19
  31. package/cpp/sampling.cpp +11 -16
  32. package/cpp/sgemm.cpp +265 -258
  33. package/cpp/sgemm.h +2 -2
  34. package/cpp/speculative.cpp +4 -0
  35. package/cpp/unicode.cpp +51 -51
  36. package/cpp/unicode.h +9 -10
  37. package/lib/commonjs/index.js +38 -1
  38. package/lib/commonjs/index.js.map +1 -1
  39. package/lib/module/index.js +36 -0
  40. package/lib/module/index.js.map +1 -1
  41. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  42. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  43. package/lib/typescript/index.d.ts +36 -2
  44. package/lib/typescript/index.d.ts.map +1 -1
  45. package/package.json +1 -1
  46. package/src/NativeRNLlama.ts +3 -3
  47. package/src/index.ts +46 -2
  48. package/cpp/amx/amx.cpp +0 -196
  49. package/cpp/amx/amx.h +0 -20
  50. package/cpp/amx/common.h +0 -101
  51. package/cpp/amx/mmq.cpp +0 -2524
  52. package/cpp/amx/mmq.h +0 -16
  53. package/cpp/ggml-aarch64.c +0 -129
  54. package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml.h CHANGED
@@ -238,7 +238,9 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1
 
-#define LM_GGML_ROPE_TYPE_NEOX 2
+#define LM_GGML_ROPE_TYPE_NEOX   2
+#define LM_GGML_ROPE_TYPE_MROPE  8
+#define LM_GGML_ROPE_TYPE_VISION 24
 
 #define LM_GGUF_MAGIC "GGUF"
 
@@ -385,15 +387,15 @@ extern "C" {
         LM_GGML_TYPE_F64     = 28,
         LM_GGML_TYPE_IQ1_M   = 29,
         LM_GGML_TYPE_BF16    = 30,
-        LM_GGML_TYPE_Q4_0_4_4 = 31,
-        LM_GGML_TYPE_Q4_0_4_8 = 32,
-        LM_GGML_TYPE_Q4_0_8_8 = 33,
+        // LM_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // LM_GGML_TYPE_Q4_0_4_8 = 32,
+        // LM_GGML_TYPE_Q4_0_8_8 = 33,
         LM_GGML_TYPE_TQ1_0   = 34,
         LM_GGML_TYPE_TQ2_0   = 35,
-        LM_GGML_TYPE_IQ4_NL_4_4 = 36,
+        // LM_GGML_TYPE_IQ4_NL_4_4 = 36,
         // LM_GGML_TYPE_IQ4_NL_4_8 = 37,
         // LM_GGML_TYPE_IQ4_NL_8_8 = 38,
-        LM_GGML_TYPE_COUNT,
+        LM_GGML_TYPE_COUNT   = 39,
     };
 
     // precision
@@ -434,9 +436,6 @@ extern "C" {
         LM_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };
 
     // available tensor operations:
@@ -500,6 +499,7 @@ extern "C" {
         LM_GGML_OP_POOL_2D_BACK,
         LM_GGML_OP_UPSCALE, // nearest interpolate
         LM_GGML_OP_PAD,
+        LM_GGML_OP_PAD_REFLECT_1D,
        LM_GGML_OP_ARANGE,
        LM_GGML_OP_TIMESTEP_EMBEDDING,
        LM_GGML_OP_ARGSORT,
@@ -1446,6 +1446,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_multi(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor  * b,
+            struct lm_ggml_tensor  * c,
+            int                      n_dims,
+            int                      sections[4],
+            int                      mode,
+            int                      n_ctx_orig,
+            float                    freq_base,
+            float                    freq_scale,
+            float                    ext_factor,
+            float                    attn_factor,
+            float                    beta_fast,
+            float                    beta_slow);
+
     // in-place, returns view(a)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext_inplace(
             struct lm_ggml_context * ctx,
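For orientation, a minimal caller-side sketch of the new multi-section RoPE entry point (not taken from the package; `ctx`, `q`, and `pos` are assumed to exist, and the section split and frequency parameters are illustrative only):

    // hypothetical: rotate query tensor `q` with M-RoPE, splitting the rotary
    // dimensions into per-axis sections (illustrative values)
    int sections[4] = { 16, 24, 24, 0 };
    struct lm_ggml_tensor * q_rot = lm_ggml_rope_multi(
            ctx, q, pos, /* freq factors */ NULL,
            /* n_dims      */ 128,
            sections,
            /* mode        */ LM_GGML_ROPE_TYPE_MROPE,
            /* n_ctx_orig  */ 0,
            /* freq_base   */ 10000.0f,
            /* freq_scale  */ 1.0f,
            /* ext_factor  */ 0.0f,
            /* attn_factor */ 1.0f,
            /* beta_fast   */ 32.0f,
            /* beta_slow   */ 1.0f);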
@@ -1549,17 +1565,6 @@ extern "C" {
             int                   d1, // dilation dimension 1
             bool                  is_2D);
 
-    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_depthwise_2d(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,  // convolution kernel
-            struct lm_ggml_tensor  * b,  // data
-            int                      s0, // stride dimension 0
-            int                      s1, // stride dimension 1
-            int                      p0, // padding dimension 0
-            int                      p1, // padding dimension 1
-            int                      d0, // dilation dimension 0
-            int                      d1); // dilation dimension 1
-
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,  // convolution kernel
@@ -1577,6 +1582,23 @@ extern "C" {
             int                      s,  // stride
             int                      d); // dilation
 
+    // depthwise
+    // TODO: this is very likely wrong for some cases! - needs more testing
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d_dw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,  // convolution kernel
+            struct lm_ggml_tensor  * b,  // data
+            int                      s0, // stride
+            int                      p0, // padding
+            int                      d0); // dilation
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_1d_dw_ph(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,  // convolution kernel
+            struct lm_ggml_tensor  * b,  // data
+            int                      s0, // stride
+            int                      d0); // dilation
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,  // convolution kernel
@@ -1596,7 +1618,6 @@ extern "C" {
             int                      d0, // dilation dimension 0
             int                      d1); // dilation dimension 1
 
-
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
@@ -1623,6 +1644,18 @@ extern "C" {
             struct lm_ggml_tensor  * a,
             struct lm_ggml_tensor  * b);
 
+    // depthwise
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,  // convolution kernel
+            struct lm_ggml_tensor  * b,  // data
+            int                      s0, // stride dimension 0
+            int                      s1, // stride dimension 1
+            int                      p0, // padding dimension 0
+            int                      p1, // padding dimension 1
+            int                      d0, // dilation dimension 0
+            int                      d1); // dilation dimension 1
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
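A caller-side sketch of the depthwise variant added above (hypothetical: it assumes a kernel tensor `k` with one filter per input channel and an input tensor `x` already built on `ctx`; the stride, padding, and dilation values are illustrative):

    // hypothetical: 3x3 depthwise convolution, stride 1, one pixel of padding
    struct lm_ggml_tensor * y = lm_ggml_conv_2d_dw(ctx, k, x,
            /* s0 */ 1, /* s1 */ 1,
            /* p0 */ 1, /* p1 */ 1,
            /* d0 */ 1, /* d1 */ 1);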
@@ -1696,6 +1729,13 @@ extern "C" {
             int                  p2,
             int                  p3);
 
+    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            int                      p0,
+            int                      p1);
+
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
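The reflection example in the header comment corresponds to one element of padding on each side; a hedged usage sketch (assuming `ctx` and a 1-D float tensor `t` holding [a, b, c, d]):

    // reflection-pad by 1 on the left (p0) and 1 on the right (p1):
    // [a, b, c, d] -> [b, a, b, c, d, c]
    struct lm_ggml_tensor * t_pad = lm_ggml_pad_reflect_1d(ctx, t, /* p0 */ 1, /* p1 */ 1);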
@@ -2198,11 +2238,19 @@ extern "C" {
     LM_GGML_API size_t lm_gguf_get_meta_size(const struct lm_gguf_context * ctx);
     LM_GGML_API void   lm_gguf_get_meta_data(const struct lm_gguf_context * ctx, void * data);
 
-#ifdef __cplusplus
-// restrict not standard in C++
-#define LM_GGML_RESTRICT
+#ifdef __cplusplus
+// restrict not standard in C++
+#    if defined(__GNUC__)
+#        define LM_GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define LM_GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define LM_GGML_RESTRICT __restrict
+#    else
+#        define LM_GGML_RESTRICT
+#    endif
 #else
-#define LM_GGML_RESTRICT restrict
+#    define LM_GGML_RESTRICT restrict
 #endif
 typedef void (*lm_ggml_to_float_t)  (const void  * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int64_t k);
 typedef void (*lm_ggml_from_float_t)(const float * LM_GGML_RESTRICT x, void  * LM_GGML_RESTRICT y, int64_t k);
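The practical effect, sketched as comments (the expansions shown assume a GCC or Clang C++ build, where the __GNUC__ branch is taken; this is not code from the package):

    // before: in C++ the qualifier expanded to nothing
    //   typedef void (*lm_ggml_to_float_t)(const void * x, float * y, int64_t k);
    // after: the no-alias hint survives in C++ via the compiler extension
    //   typedef void (*lm_ggml_to_float_t)(const void * __restrict__ x, float * __restrict__ y, int64_t k);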
package/cpp/llama-grammar.cpp CHANGED
@@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        const uint32_t chr,
-        llama_grammar_stacks & stacks_new) {
-    stacks_new.clear();
-    stacks_new.reserve(stacks.size());
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar->stacks.size());
 
-    for (const auto & stack : stacks) {
+    for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
             continue;
         }
@@ -844,9 +840,11 @@ void llama_grammar_accept(
             if (!llama_grammar_is_end_of_sequence(pos)) {
                 new_stack.push_back(pos);
             }
-            llama_grammar_advance_stack(rules, new_stack, stacks_new);
+            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
         }
     }
+
+    grammar->stacks = std::move(stacks_new);
 }
 
 llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
+    llama_grammar * result = new llama_grammar {
+        grammar.vocab,
+        grammar.rules,
+        grammar.stacks,
+        grammar.partial_utf8,
+    };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
             for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
                 if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
-                    result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    result->stacks[is][ie] =  &result->rules[ir0][ir1];
                 }
             }
         }
@@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
 
-    llama_grammar_stacks stacks_new;
-
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
-        grammar.stacks = std::move(stacks_new);
+        llama_grammar_accept(&grammar, *it);
     }
 
     grammar.partial_utf8 = decoded.second;
package/cpp/llama-grammar.h CHANGED
@@ -58,6 +58,7 @@ using llama_grammar_rules = std::vector<llama_grammar_rule>;
 using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
 using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
 
+// TODO: remove, needed for tests atm
 const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
       llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 
@@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-              uint32_t               chr,
-              llama_grammar_stacks & stacks_new);
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
 
 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const llama_grammar_rules & rules,
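Caller-side, the refactor removes the external scratch vector; a hedged before/after sketch (assuming a constructed `struct llama_grammar * grammar` and a decoded code point `cpt`):

    // before: the caller owned the scratch stacks and swapped them in
    //   llama_grammar_stacks stacks_new;
    //   llama_grammar_accept(grammar->rules, grammar->stacks, cpt, stacks_new);
    //   grammar->stacks = std::move(stacks_new);
    // after: the grammar updates its own stacks in place
    llama_grammar_accept(grammar, cpt);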
package/cpp/llama-sampling.cpp CHANGED
@@ -1397,19 +1397,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool penalize_nl;
-    const bool ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1422,76 +1418,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx->prev.push_back(token);
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
     }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
     }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
         }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
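The net effect of this hunk: instead of rebuilding a frequency map from the ring buffer on every apply call, the sampler keeps `token_count` in sync as tokens enter and leave the last-n window. A self-contained sketch of that sliding-window counting scheme (plain C++ with a std::deque standing in for the ring buffer; not the package's code):

    #include <cstdint>
    #include <deque>
    #include <unordered_map>

    struct penalty_window {
        int last_n;                                 // window size (penalty_last_n)
        std::deque<int32_t> prev;                   // last tokens, oldest at the front
        std::unordered_map<int32_t, int> count;     // token -> occurrences in the window

        void accept(int32_t token) {
            if (last_n == 0) {
                return;                             // mirrors the early return above
            }
            count[token]++;
            if ((int) prev.size() >= last_n) {      // window full: evict the oldest token
                const int32_t old = prev.front();
                prev.pop_front();
                if (--count[old] == 0) {
                    count.erase(old);
                }
            }
            prev.push_back(token);
        }
    };

The apply step then only has to look up each candidate id in `count`, which is what the rewritten loop above does with `ctx->token_count.find(...)`.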
@@ -1504,30 +1474,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-            ctx->n_vocab,
-            ctx->special_eos_id,
-            ctx->linefeed_id,
             ctx->penalty_last_n,
             ctx->penalty_repeat,
             ctx->penalty_freq,
-            ctx->penalty_present,
-            ctx->penalize_nl,
-            ctx->ignore_eos);
+            ctx->penalty_present);
 
     // copy the state
     {
@@ -1553,38 +1514,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
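A hedged construction sketch against the new, slimmer signature (the argument values are illustrative, not taken from the package):

    // hypothetical: repetition penalty over the last 64 tokens, no freq/presence penalty
    struct llama_sampler * pen = llama_sampler_init_penalties(
            /* penalty_last_n  */ 64,
            /* penalty_repeat  */ 1.10f,
            /* penalty_freq    */ 0.00f,
            /* penalty_present */ 0.00f);

Callers that previously passed n_vocab, the EOS and linefeed token ids, penalize_nl, or ignore_eos simply drop those arguments; newline restoration and EOS masking are no longer handled by this sampler.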
@@ -1612,7 +1556,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
package/cpp/llama-vocab.cpp CHANGED
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) {  // finish previous word if any
@@ -1656,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
 }
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
-    return vocab.special_bos_id;
+    return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
 }
 
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
@@ -1866,6 +1867,10 @@ int32_t llama_detokenize_impl(
         int32_t   text_len_max,
         bool      remove_special,
         bool      unparse_special) {
+    if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
     LM_GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;
package/cpp/llama-vocab.h CHANGED
@@ -45,7 +45,7 @@ struct llama_vocab {
     id special_unk_id  = 0;
     id special_sep_id  = LLAMA_TOKEN_NULL;
     id special_pad_id  = LLAMA_TOKEN_NULL;
-    id special_cls_id  = LLAMA_TOKEN_NULL;
+    id special_cls_id  = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
     id special_mask_id = LLAMA_TOKEN_NULL;
 
     id linefeed_id = 13;