cui-llama.rn 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnllama/LlamaContext.java +5 -2
- package/android/src/main/jni.cpp +7 -7
- package/cpp/common.cpp +81 -63
- package/cpp/common.h +79 -62
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend.cpp +59 -24
- package/cpp/ggml-impl.h +8 -0
- package/cpp/ggml.c +65 -23
- package/cpp/ggml.h +1 -0
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +366 -24
- package/cpp/llama-sampling.h +3 -2
- package/cpp/llama-vocab.cpp +33 -9
- package/cpp/llama-vocab.h +30 -11
- package/cpp/llama.cpp +471 -387
- package/cpp/llama.h +52 -21
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +110 -119
- package/cpp/sampling.h +20 -20
- package/package.json +1 -1
package/cpp/llama-sampling.cpp
CHANGED
@@ -63,6 +63,30 @@ static void llama_log_softmax(float * array, size_t size) {
 }
 */
 
+static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+    if (temp <= 0.0f) {
+        // find the token with the highest logit and set the rest to -inf
+        size_t max_i = 0;
+        float  max_l = cur_p->data[0].logit;
+
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit > max_l) {
+                cur_p->data[max_i].logit = -INFINITY;
+                max_i = i;
+                max_l = cur_p->data[i].logit;
+            } else {
+                cur_p->data[i].logit = -INFINITY;
+            }
+        }
+
+        return;
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= temp;
+    }
+}
+
 static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
     LM_GGML_ASSERT(cur_p->size > 0);
 
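Note: llama_sampler_temp_impl above folds greedy decoding into the temperature path: for temp <= 0 it keeps only the arg-max logit and masks the rest with -inf, so a later softmax assigns that token probability 1. A minimal standalone sketch of that branch (the token_data struct here is hypothetical test scaffolding mirroring llama_token_data):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct token_data { int id; float logit; };   // stand-in for llama_token_data

    int main() {
        std::vector<token_data> cur = {{0, 1.5f}, {1, 3.2f}, {2, 0.7f}};

        // temp <= 0.0f branch: keep the best logit, set the rest to -inf
        size_t max_i = 0;
        float  max_l = cur[0].logit;
        for (size_t i = 1; i < cur.size(); ++i) {
            if (cur[i].logit > max_l) {
                cur[max_i].logit = -INFINITY;
                max_i = i;
                max_l = cur[i].logit;
            } else {
                cur[i].logit = -INFINITY;
            }
        }

        // prints: only token 1 keeps a finite logit (3.2)
        for (const auto & td : cur) {
            std::printf("id=%d logit=%f\n", td.id, (double) td.logit);
        }
    }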
@@ -428,6 +452,9 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
 
 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
 }
 
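Note: llama_sampler_dist_apply now calls llama_sampler_softmax_impl itself rather than relying on an earlier sampler in the chain to have normalized the candidates. The drawing step it then performs is equivalent in spirit to sampling from a discrete distribution over the normalized probabilities; a minimal sketch (std::discrete_distribution is used here for illustration, not the library's actual llama_sample_dist):

    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        // probabilities as they look after llama_sampler_softmax_impl
        std::vector<float> probs = {0.7f, 0.2f, 0.1f};

        std::mt19937 rng(1234);
        std::discrete_distribution<int> dist(probs.begin(), probs.end());

        // index of the sampled candidate, drawn in proportion to probs
        std::printf("selected = %d\n", dist(rng));
    }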
@@ -709,6 +736,7 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
 
 // xtc
 
+/*
 struct llama_sampler_xtc {
     const uint32_t seed;
     std::mt19937 rng;
@@ -717,7 +745,7 @@ struct llama_sampler_xtc {
     const size_t min_keep;
 };
 
-static const char * llama_sampler_xtc_name(const struct llama_sampler * /* smpl */) {
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /* smpl /) {
     return "xtc";
 }
 
@@ -830,27 +858,27 @@ static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_xtc_i = {
-    /* .name   = */ llama_sampler_xtc_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_xtc_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_xtc_clone,
-    /* .free   = */ llama_sampler_xtc_free,
+    /* .name   = / llama_sampler_xtc_name,
+    /* .accept = / nullptr,
+    /* .apply  = / llama_sampler_xtc_apply,
+    /* .reset  = / nullptr,
+    /* .clone  = / llama_sampler_xtc_clone,
+    /* .free   = / llama_sampler_xtc_free,
 };
 
 struct llama_sampler * llama_sampler_init_xtc(float xtc_p, float xtc_t, size_t min_keep, uint32_t seed) {
     return new llama_sampler {
-        /* .iface = */ &llama_sampler_xtc_i,
-        /* .ctx   = */ new llama_sampler_xtc {
-            /* .seed     = */ seed,
-            /* .rng      = */ std::mt19937(seed),
-            /* .xtc_p    = */ xtc_p,
-            /* .xtc_t    = */ xtc_t,
-            /* .min_keep = */ min_keep
+        /* .iface = / &llama_sampler_xtc_i,
+        /* .ctx   = / new llama_sampler_xtc {
+            /* .seed     = / seed,
+            /* .rng      = / std::mt19937(seed),
+            /* .xtc_p    = / xtc_p,
+            /* .xtc_t    = / xtc_t,
+            /* .min_keep = / min_keep
         },
     };
 }
-
+*/
 // tail-free
 
 struct llama_sampler_tail_free {
@@ -1057,9 +1085,8 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*/) {
 
 static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_temp *) smpl->ctx;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cur_p->data[i].logit /= ctx->temp;
-    }
+
+    llama_sampler_temp_impl(cur_p, ctx->temp);
 }
 
 static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
@@ -1106,6 +1133,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     if (ctx->delta > 0) {
         const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
         const float max_temp = ctx->temp + ctx->delta;
+
         float exponent_val = ctx->exponent;
 
         // no need to do anything if there is only one (or zero) candidates
@@ -1143,9 +1171,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
 #endif
 
     // Apply the dynamically calculated temperature scaling
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cur_p->data[i].logit /= dyn_temp;
-    }
+    llama_sampler_temp_impl(cur_p, dyn_temp);
 
     // Re-compute softmax probabilities after scaling logits with dynamic temperature
     const double max_l_double = cur_p->data[0].logit;
@@ -1169,9 +1195,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
         }
 #endif
     } else {
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            cur_p->data[i].logit /= ctx->temp;
-        }
+        llama_sampler_temp_impl(cur_p, ctx->temp);
     }
 }
 
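Note: both branches of llama_sampler_temp_ext_apply now delegate the final scaling to llama_sampler_temp_impl. In the dynamic branch, upstream llama.cpp computes the temperature by interpolating between min_temp and max_temp by the normalized entropy of the candidates raised to exponent_val; a numeric sketch of that formula under hypothetical values:

    #include <cmath>
    #include <cstdio>

    int main() {
        // stand-ins for ctx->temp, ctx->delta, ctx->exponent
        const float temp = 0.8f, delta = 0.3f, exponent_val = 1.0f;
        const float min_temp = std::fmax(0.0f, temp - delta); // 0.5
        const float max_temp = temp + delta;                  // 1.1

        // normalized entropy of the candidate distribution, in [0, 1]
        const float normalized_entropy = 0.6f;

        const float dyn_temp = min_temp
            + (max_temp - min_temp) * std::pow(normalized_entropy, exponent_val);

        std::printf("dyn_temp = %.3f\n", (double) dyn_temp); // 0.860
    }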
@@ -1204,6 +1228,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
     };
 }
 
+// xtc
+
+struct llama_sampler_xtc {
+    const float    probability;
+    const float    threshold;
+    const size_t   min_keep;
+
+    const uint32_t seed;
+    uint32_t       seed_cur;
+
+    std::mt19937   rng;
+};
+
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
+    return "xtc";
+}
+
+static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+
+    if (ctx->probability <= 0.0f
+        || ctx->threshold > 0.5f
+        || cur_p->size < 2) {
+        return;
+    }
+
+    std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+    float chance = distribution(ctx->rng);
+    if (chance > ctx->probability) return;
+
+    // in case it's not sorted/recalculated yet
+    llama_sampler_softmax_impl(cur_p);
+
+    int pos_last = 0;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (cur_p->data[i].p >= ctx->threshold) {
+            pos_last = i;
+        } else break;
+    }
+
+    if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
+        cur_p->data += pos_last;
+        cur_p->size -= pos_last;
+    }
+}
+
+static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
+    auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_xtc *) result->ctx;
+
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_xtc *) smpl->ctx;
+}
+
+static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler_i llama_sampler_xtc_i = {
+    /* .name   = */ llama_sampler_xtc_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sample_xtc_apply,
+    /* .reset  = */ llama_sampler_xtc_reset,
+    /* .clone  = */ llama_sampler_xtc_clone,
+    /* .free   = */ llama_sampler_xtc_free,
+};
+
+struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
+    auto seed_cur = get_rng_seed(seed);
+    return new llama_sampler {
+        /* .iface = */ &llama_sampler_xtc_i,
+        /* .ctx   = */ new llama_sampler_xtc {
+            /* .probability = */ p,
+            /* .threshold   = */ t,
+            /* .min_keep    = */ min_keep,
+            /* .seed        = */ seed,
+            /* .seed_cur    = */ seed_cur,
+            /* .rng         = */ std::mt19937(seed_cur),
+        },
+    };
+}
+
 // mirostat
 
 struct llama_sampler_mirostat {
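Note: this reintroduces XTC ("exclude top choices") in its upstream llama.cpp form: with probability `probability`, every candidate whose softmax probability is at or above `threshold` is cut except the last such candidate, steering generation away from the most predictable tokens. A hedged usage sketch, assuming the sampler-chain API from llama.h as it exists in upstream llama.cpp:

    // sketch only -- chain helpers assumed from upstream llama.h
    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    llama_sampler * chain = llama_sampler_chain_init(sparams);

    // with probability 0.5, drop all but the last candidate whose
    // probability is >= 0.10 (min_keep = 1, seed = 1234)
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.5f, 0.10f, 1, 1234));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));

    // const llama_token tok = llama_sampler_sample(chain, lctx, -1);

    llama_sampler_free(chain);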
@@ -1789,6 +1908,229 @@ struct llama_sampler * llama_sampler_init_logit_bias(
 };
 }
 
+// infill
+
+//#define LM_GGML_DEBUG_SAMPLER_INFILL
+
+struct llama_sampler_infill {
+    const struct llama_vocab * vocab;
+
+    std::vector<char> buf0;
+    std::vector<char> buf1;
+};
+
+static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
+    return "infill";
+}
+
+static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_infill *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+#if defined(LM_GGML_DEBUG_SAMPLER_INFILL)
+#define LOG_DBG_CUR LLAMA_LOG_DEBUG
+#else
+#define LOG_DBG_CUR(...)
+#endif
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+    float p_txt_sum = 0.0f;
+    float p_eog_sum = 0.0f;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+            p_eog_sum += cur_p->data[i].p;
+        } else {
+            p_txt_sum += cur_p->data[i].p;
+        }
+    }
+
+    const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; LM_GGML_UNUSED(rat);
+
+    LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
+
+    if (3*p_eog_sum*cur_p->size > p_txt_sum) {
+        LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
+
+        // keep just the EOG tokens
+        const auto size_org = cur_p->size;
+
+        cur_p->size = 0;
+
+        float p_sum = 0.0f;
+
+        for (size_t i = 0; i < size_org; ++i) {
+            if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+                p_sum += cur_p->data[i].p;
+
+                cur_p->data[cur_p->size++] = cur_p->data[i];
+            }
+        }
+
+        // normalize probs
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].p /= p_sum;
+        }
+
+        return;
+    }
+
+    size_t n_combined = 0; LM_GGML_UNUSED(n_combined);
+
+    // combine tokens with common prefix
+    for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
+        for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
+            if (cur_p->data[i0].logit == -INFINITY) {
+                break;
+            }
+
+            if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
+                continue;
+            }
+
+            int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            if (len0 < 0) {
+                ctx->buf0.resize(len0);
+                len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+                assert(len0 > 0);
+            }
+
+            int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            if (len1 < 0) {
+                ctx->buf1.resize(len1);
+                len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+                assert(len1 > 0);
+            }
+
+            // token i0 is a prefix of token i1
+            if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
+                int dst = i0;
+                int src = i1;
+
+                // merge into the token with higher probability
+                if (cur_p->data[i1].p > cur_p->data[i0].p) {
+                    std::swap(dst, src);
+                }
+
+                cur_p->data[dst].p += cur_p->data[src].p;
+                cur_p->data[src].logit = -INFINITY;
+                cur_p->data[src].p     = 0.0f;
+
+                n_combined++;
+            }
+        }
+    }
+
+    size_t n_non_eog = 0;
+
+    size_t size_org = cur_p->size;
+
+    float p_sum = 0.0f;
+    float thold = 0.2f;
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        if (!is_eog) {
+            ++n_non_eog;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        // keep this token
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
+
+    // if no non-EOG tokens are left -> reduce cur_p to single EOT token
+    if (n_non_eog == 0) {
+        cur_p->size = 1;
+        cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
+        cur_p->data[0].logit = 1.0f;
+
+        return;
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+    size_org = cur_p->size;
+    p_sum = 0.0f;
+    thold = 1.0/(n_non_eog + 1);
+
+    cur_p->size = 0;
+
+    LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
+
+    for (size_t i = 0; i < size_org; ++i) {
+        const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+        if (cur_p->data[i].p < thold && !is_eog) {
+            continue;
+        }
+
+        p_sum += cur_p->data[i].p;
+
+        cur_p->data[cur_p->size++] = cur_p->data[i];
+    }
+
+    // normalize probs
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= p_sum;
+
+        LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+
+#undef LOG_DBG_CUR
+}
+
+static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
+    return llama_sampler_init_infill_impl(*ctx->vocab);
+}
+
+static void llama_sampler_infill_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_infill *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_infill_i = {
+    /* .name   = */ llama_sampler_infill_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_infill_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_infill_clone,
+    /* .free   = */ llama_sampler_infill_free,
+};
+
+struct llama_sampler * llama_sampler_init_infill_impl(
+        const struct llama_vocab & vocab) {
+    return new llama_sampler {
+        /* .iface = */ &llama_sampler_infill_i,
+        /* .ctx   = */ new llama_sampler_infill {
+            /* .vocab = */ &vocab,
+            /* .buf0  = */ std::vector<char>(512),
+            /* .buf1  = */ std::vector<char>(512),
+        },
+    };
+}
+
 // utils
 
 uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
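Note: the infill sampler's first gate is the heuristic 3*p_eog_sum*cur_p->size > p_txt_sum: the more candidates there are, the less end-of-generation mass it takes to collapse to EOG tokens. A worked check of the condition with hypothetical numbers:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // hypothetical candidate set: 4 tokens, EOG mass 0.15, text mass 0.85
        const float       p_eog_sum = 0.15f;
        const float       p_txt_sum = 0.85f;
        const std::size_t n         = 4;

        // 3 * 0.15 * 4 = 1.8 > 0.85 -> collapse to EOG tokens only
        if (3 * p_eog_sum * n > p_txt_sum) {
            std::printf("sample EOG\n");
        } else {
            std::printf("keep text tokens\n");
        }
    }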
package/cpp/llama-sampling.h
CHANGED
@@ -4,8 +4,6 @@
 
 #include "llama-grammar.h"
 
-#include <unordered_map>
-
 struct llama_vocab;
 struct llama_grammar;
 
@@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
         const struct llama_vocab & vocab,
         const char * grammar_str,
         const char * grammar_root);
+
+struct llama_sampler * llama_sampler_init_infill_impl(
+        const struct llama_vocab & vocab);
package/cpp/llama-vocab.cpp
CHANGED
@@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
     }
 
     // seed the work queue with all possible 2-character tokens.
-    for (size_t i = 1; i < symbols.size(); ++i) {
+    for (int i = 1; i < (int) symbols.size(); ++i) {
         try_add_bigram(i - 1, i);
     }
 
@@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
         index++;
         symbols.emplace_back(sym);
     }
-    for (size_t i = 1; i < symbols.size(); ++i) {
+    for (int i = 1; i < (int) symbols.size(); ++i) {
         add_new_bigram(i - 1, i);
     }
 
@@ -1663,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
     return vocab.special_eos_id;
 }
 
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eot_id;
+}
+
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
     return vocab.special_cls_id;
 }
@@ -1688,23 +1696,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
 }
 
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
-    return vocab.special_prefix_id;
+    return vocab.special_fim_pre_id;
 }
 
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
-    return vocab.special_middle_id;
+    return vocab.special_fim_mid_id;
 }
 
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
-    return vocab.special_suffix_id;
+    return vocab.special_fim_suf_id;
 }
 
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
-    return vocab.special_eot_id;
+llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_pre_id;
 }
 
-llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
-    return vocab.special_eom_id;
+llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_suf_id;
+}
+
+llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_mid_id;
+}
+
+llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_pad_id;
+}
+
+llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_rep_id;
+}
+
+llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_sep_id;
 }
 
 int32_t llama_tokenize_impl(
package/cpp/llama-vocab.h
CHANGED
@@ -37,20 +37,26 @@ struct llama_vocab {
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
     // default LLaMA special tokens
+    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
     id special_bos_id  = 1;
     id special_eos_id  = 2;
+    id special_eot_id  = LLAMA_TOKEN_NULL;
+    id special_eom_id  = LLAMA_TOKEN_NULL;
     id special_unk_id  = 0;
     id special_sep_id  = LLAMA_TOKEN_NULL;
     id special_pad_id  = LLAMA_TOKEN_NULL;
     id special_cls_id  = LLAMA_TOKEN_NULL;
     id special_mask_id = LLAMA_TOKEN_NULL;
 
-    id linefeed_id       = 13;
-    id special_prefix_id = LLAMA_TOKEN_NULL;
-    id special_suffix_id = LLAMA_TOKEN_NULL;
-    id special_middle_id = LLAMA_TOKEN_NULL;
-    id special_eot_id    = LLAMA_TOKEN_NULL;
-    id special_eom_id    = LLAMA_TOKEN_NULL;
+    id linefeed_id = 13;
+
+    // fim tokens
+    id special_fim_pre_id = LLAMA_TOKEN_NULL;
+    id special_fim_suf_id = LLAMA_TOKEN_NULL;
+    id special_fim_mid_id = LLAMA_TOKEN_NULL;
+    id special_fim_pad_id = LLAMA_TOKEN_NULL;
+    id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+    id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
 
     // set of all tokens that cause "end of generation"
     std::set<id> special_eog_ids;
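Note: the special_fim_* ids back the standard fill-in-the-middle prompt layout: <fim_pre> prefix <fim_suf> suffix <fim_mid>, after which the model generates the missing middle until an EOG token. A hedged sketch of assembling such a prompt (the tokenize() stub and the raw ids are hypothetical; real code would fetch the ids via the llama_token_fim_*_impl accessors added in this diff):

    #include <cstdint>
    #include <string>
    #include <vector>

    using llama_token = std::int32_t;

    // hypothetical stand-in: one fake token per byte
    static std::vector<llama_token> tokenize(const std::string & text) {
        return std::vector<llama_token>(text.begin(), text.end());
    }

    // <fim_pre> prefix <fim_suf> suffix <fim_mid>
    static std::vector<llama_token> build_fim_prompt(
            llama_token fim_pre, llama_token fim_suf, llama_token fim_mid,
            const std::string & prefix, const std::string & suffix) {
        std::vector<llama_token> out;
        out.push_back(fim_pre);
        const auto pre = tokenize(prefix);
        out.insert(out.end(), pre.begin(), pre.end());
        out.push_back(fim_suf);
        const auto suf = tokenize(suffix);
        out.insert(out.end(), suf.begin(), suf.end());
        out.push_back(fim_mid);
        return out;
    }

    int main() {
        const auto prompt = build_fim_prompt(-1, -2, -3, "int add(int a, int b", "}");
        return prompt.empty() ? 1 : 0;
    }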
@@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
 
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
 llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
 llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
 
-bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
-bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
-
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
-
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
+
+llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
+
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
@@ -136,6 +149,12 @@ int32_t llama_token_to_piece_impl(
     int32_t lstrip,
     bool special);
 
+// check if token0 is contained as a prefix in token1
+bool llama_token_is_prefix_impl(
+        const struct llama_vocab & vocab,
+        llama_token token0,
+        llama_token token1);
+
 int32_t llama_detokenize_impl(
     const struct llama_vocab & vocab,
     const llama_token * tokens,