@fugood/llama.node 0.3.1 → 0.3.3

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -64,7 +50,7 @@ struct naive_trie {
             res.first->second.insert(key + 1, len - 1, value);
         }
     }
-    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
+    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
@@ -72,17 +58,17 @@ struct naive_trie {
         auto res = children.find(c);
         if (res != children.end()) {
             return res->second.get_longest_prefix(key, len, offset + 1);
-        } else {
-            return std::make_pair(key, offset);
         }
+
+        return std::make_pair(key, offset);
     }
-    struct naive_trie * traverse(const char c) {
+    const struct naive_trie * traverse(const char c) const {
         auto res = children.find(c);
         if (res != children.end()) {
             return &res->second;
-        } else {
-            return NULL;
         }
+
+        return NULL;
     }
     std::map<char, struct naive_trie> children;
     bool has_value;
@@ -93,6 +79,15 @@ struct naive_trie {
 // impl
 //
 
+struct llm_tokenizer {
+    llm_tokenizer() {}
+    virtual ~llm_tokenizer() = default;
+};
+
+llama_vocab::~llama_vocab() {
+    delete tokenizer;
+}
+
 int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
     GGML_ASSERT(token_left.find(' ') == std::string::npos);
     GGML_ASSERT(token_left.find('\n') == std::string::npos);
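The two structs added above are the heart of this refactor: `llm_tokenizer` is a polymorphic base for immutable, precomputed tokenizer state, built once per vocab and freed in `~llama_vocab()`, while the mutable scratch state moves into per-call `*_session` objects introduced in the hunks below. A minimal editorial sketch of the pattern, using hypothetical names (`tokenizer_model`, `tokenizer_session`) rather than the diff's actual types:

    #include <iostream>
    #include <string>
    #include <vector>

    // built once; holds only immutable tables, so it can be shared freely
    struct tokenizer_model {
        std::vector<std::string> merges;
    };

    // built per tokenize() call; owns every piece of mutable scratch state
    struct tokenizer_session {
        explicit tokenizer_session(const tokenizer_model & model) : model(model) {}

        void tokenize(const std::string & text, std::vector<int> & out) {
            symbols.assign(text.begin(), text.end()); // per-call scratch only
            out.push_back((int) symbols.size());      // read-only use of `model`
            out.push_back((int) model.merges.size());
        }

    private:
        const tokenizer_model & model;
        std::string symbols;
    };

    int main() {
        const tokenizer_model model{{"ab", "cd"}};
        std::vector<int> out;
        tokenizer_session(model).tokenize("hello", out); // cheap to construct per call
        std::cout << out[0] << ' ' << out[1] << '\n';    // prints: 5 2
    }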
@@ -201,10 +196,15 @@ struct llm_bigram_spm {
     size_t size;
 };
 
-struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
+struct llm_tokenizer_spm : llm_tokenizer {
+    llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+};
+
+struct llm_tokenizer_spm_session {
+    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
@@ -221,7 +221,7 @@ struct llm_tokenizer_spm {
         }
 
         // seed the work queue with all possible 2-character tokens.
-        for (size_t i = 1; i < symbols.size(); ++i) {
+        for (int i = 1; i < (int) symbols.size(); ++i) {
             try_add_bigram(i - 1, i);
         }
 
@@ -285,7 +285,7 @@ private:
             return;
         }
 
-        resegment(symbols[p->second.first], output);
+        resegment(symbols[p->second.first],  output);
         resegment(symbols[p->second.second], output);
     }
 
@@ -293,7 +293,6 @@ private:
         if (left == -1 || right == -1) {
             return;
         }
-
         const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
         auto token = vocab.token_to_id.find(text);
 
@@ -320,10 +319,11 @@ private:
     }
 
     const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_spm * spm_tokenizer;
 
     std::vector<llm_symbol> symbols;
     llm_bigram_spm::queue work_queue;
-
     std::map<std::string, std::pair<int, int>> rev_merge;
 };
 
@@ -335,6 +335,21 @@ private:
 
 // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
 
+template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
+class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
+public:
+    using std::priority_queue<T, Container, Compare>::priority_queue;
+
+    T pop_move() {
+        T item = std::move(this->c.front());
+        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
+        this->c.pop_back();
+        return item;
+    }
+
+    void pop() = delete;
+};
+
 struct llm_bigram_bpe {
     struct comparator {
         bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
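`llama_priority_queue` exists because `std::priority_queue::top()` returns a const reference, so the idiomatic `top()` + `pop()` pair must copy the element — and `llm_bigram_bpe` carries a `std::string`. Deriving from `std::priority_queue` exposes the protected container `c` and comparator `comp`, letting `pop_move()` move the front element out before re-heapifying; deleting `pop()` keeps callers from mixing the two styles. The same trick in a self-contained sketch:

    #include <algorithm>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <vector>

    template <typename T, typename Container = std::vector<T>,
              typename Compare = std::less<typename Container::value_type>>
    class movable_priority_queue : public std::priority_queue<T, Container, Compare> {
    public:
        using std::priority_queue<T, Container, Compare>::priority_queue;

        // move the max element out instead of copying it via top() + pop()
        T pop_move() {
            T item = std::move(this->c.front()); // `c` is protected, hence visible here
            std::pop_heap(this->c.begin(), this->c.end(), this->comp);
            this->c.pop_back();
            return item;
        }

        void pop() = delete; // force callers through pop_move()
    };

    int main() {
        movable_priority_queue<std::string> q;
        q.push("bb");
        q.push("aaa");
        q.push("c");
        while (!q.empty()) {
            std::cout << q.pop_move() << '\n'; // prints: c, bb, aaa
        }
    }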
@@ -343,7 +358,7 @@ struct llm_bigram_bpe {
     };
 
     using queue_storage = std::vector<llm_bigram_bpe>;
-    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
     llm_symbol::index left;
     llm_symbol::index right;
     std::string text;
@@ -351,8 +366,8 @@ struct llm_bigram_bpe {
     size_t size;
 };
 
-struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+struct llm_tokenizer_bpe : llm_tokenizer {
+    llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
         GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
         switch (vocab.type_pre) {
             case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
@@ -402,6 +417,7 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -424,6 +440,8 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_PORO:
+            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                 regex_exprs = {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
@@ -446,6 +464,20 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
+                // Note: in theory, the special token (sentinel and image token) regex_exprs below
+                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
+                // However, since the upstream pre-tokenizer uses them, they are also
+                // included here (see https://huggingface.co/facebook/chameleon-7b).
+                regex_exprs = {
+                    "<sentinel:[0-9]+>",  // Sentinel tokens
+                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
+                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
+                    "\\p{N}", // Individual digits
+                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -458,7 +490,14 @@ struct llm_tokenizer_bpe {
         }
     }
 
-    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+    std::vector<std::string> regex_exprs;
+};
+
+struct llm_tokenizer_bpe_session {
+    llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
+        bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
+
+    static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) {
         output.push_back(token_id);
     }
 
@@ -497,12 +536,11 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-
-        const auto word_collection = unicode_regex_split(text, regex_exprs);
+        const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
 
         symbols_final.clear();
 
-        for (auto & word : word_collection) {
+        for (const auto & word : word_collection) {
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 
@@ -525,14 +563,13 @@ struct llm_tokenizer_bpe {
                 index++;
                 symbols.emplace_back(sym);
             }
-            for (size_t i = 1; i < symbols.size(); ++i) {
+            for (int i = 1; i < (int) symbols.size(); ++i) {
                 add_new_bigram(i - 1, i);
             }
 
             // build token(s)
             while (!work_queue.empty()) {
-                auto bigram = work_queue.top();
-                work_queue.pop();
+                auto bigram = work_queue.pop_move();
 
                 auto & left_symbol = symbols[bigram.left];
                 auto & right_symbol = symbols[bigram.right];
@@ -606,7 +643,6 @@ private:
        if (left == -1 || right == -1) {
            return;
        }
-
        std::string left_token = std::string(symbols[left].text, symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);
 
@@ -630,12 +666,10 @@ private:
     }
 
     const llama_vocab & vocab;
-
-    std::vector<std::string> regex_exprs;
+    const llm_tokenizer_bpe * bpe_tokenizer;
 
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;
-
     llm_bigram_bpe::queue work_queue;
 };
 
@@ -643,15 +677,17 @@ private:
 // WPM tokenizer
 //
 
-struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+struct llm_tokenizer_wpm : llm_tokenizer {
+    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+};
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
-        const auto & token_map = vocab.token_to_id;
+struct llm_tokenizer_wpm_session {
+    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        const auto & token_map = vocab.token_to_id;
         // normalize and split by whitespace
         std::vector<std::string> words = preprocess(text);
-
         // bos token prepended already
 
         // find the longest tokens that form the words
@@ -696,7 +732,7 @@ struct llm_tokenizer_wpm {
     }
 
     // TODO: reduce string copies by using cpts_offs array
-    std::vector<std::string> preprocess(const std::string & text) const {
+    static std::vector<std::string> preprocess(const std::string & text) {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
 
@@ -748,15 +784,18 @@ struct llm_tokenizer_wpm {
         //(cpt >= 0xFF00 && cpt <= 0xFFEF);
     }
 
+private:
     const llama_vocab & vocab;
+    // currently unused
+    // const llm_tokenizer_wpm * wpm_tokenizer;
 };
 
 //
 // UGM tokenizer
 //
 
-struct llm_tokenizer_ugm {
-    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
+struct llm_tokenizer_ugm : llm_tokenizer {
+    llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
         if (vocab.precompiled_charsmap.size() > 0) {
             size_t charsmap_offset = 0;
 
@@ -802,6 +841,30 @@ struct llm_tokenizer_ugm {
         unknown_token_score = min_score - unknown_token_score_penalty;
     }
 
+    // escaped space symbol - U+2581 (Lower One Eighth Block)
+    const std::string escaped_space = "\xE2\x96\x81";
+
+    const char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;
+
+    const uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;
+
+    struct naive_trie user_defined_token_matcher;
+
+    float min_score = FLT_MAX;
+    float max_score = -FLT_MAX;
+
+    float unknown_token_score_penalty = 10.0;
+    float unknown_token_score;
+
+    struct naive_trie token_matcher;
+};
+
+struct llm_tokenizer_ugm_session {
+    llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
+        ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
+
     /* This implementation is based on SentencePiece optimized Viterbi algorithm for
      * unigram language models. The general idea is to:
      * - move along the input sequence in steps of one UTF code point,
@@ -816,6 +879,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -837,7 +903,7 @@ struct llm_tokenizer_ugm {
             // traverse the token matcher trie to find a matching token
             bool single_codepoint_token_found = false;
             const struct best_tokenization & current_best = tokenization_results[input_offset];
-            struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
+            const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
 
             while (prefix_offset <= input_len && node != NULL) {
                 // check if we found valid token in prefix
@@ -867,7 +933,7 @@ struct llm_tokenizer_ugm {
             // if we didn't find a valid token corresponding to the whole UTF code point
             // then use unknown token as the tokenization of this UTF code point
             if (!single_codepoint_token_found) {
-                const double challenger_score = current_best.score_sum + unknown_token_score;
+                const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
@@ -895,11 +961,10 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
-    const llama_vocab & vocab;
 
     // helper structure for returning normalization results
     struct normalization_result {
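The UGM session implements the SentencePiece-style unigram Viterbi search sketched in the comment above: it records, for every input offset, the best-scoring tokenization ending there, then backtracks from the end of the input. Because backtracking emits tokens last-to-first, the code appends and then reverses — and the change to `output.begin() + output_size` ensures only the tokens appended by this call are reversed, not tokens a previous fragment already wrote into `output`. A toy editorial version of the same dynamic program over a plain map vocabulary (the real code walks tries and an XCDA instead, and handles unknown tokens):

    #include <algorithm>
    #include <cfloat>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Toy unigram-LM Viterbi tokenizer: best[i] = best score of tokenizing text[0..i).
    // Assumes the text is fully tokenizable with the given vocabulary.
    static std::vector<std::string> viterbi_tokenize(
            const std::string & text, const std::map<std::string, float> & vocab) {
        const size_t n = text.size();
        std::vector<float>  best(n + 1, -FLT_MAX);
        std::vector<size_t> prev(n + 1, 0); // start offset of the token ending at i
        best[0] = 0.0f;

        for (size_t i = 0; i < n; ++i) {
            if (best[i] == -FLT_MAX) continue; // offset i is unreachable
            for (const auto & [tok, score] : vocab) {
                // compare() != 0 whenever the token does not fully fit, so the
                // index i + tok.size() below is only touched when it is <= n
                if (text.compare(i, tok.size(), tok) == 0 && best[i] + score > best[i + tok.size()]) {
                    best[i + tok.size()] = best[i] + score;
                    prev[i + tok.size()] = i;
                }
            }
        }

        // backtrack from the end, then reverse the collected tokens
        std::vector<std::string> out;
        for (size_t i = n; i > 0; i = prev[i]) {
            out.push_back(text.substr(prev[i], i - prev[i]));
        }
        std::reverse(out.begin(), out.end());
        return out;
    }

    int main() {
        const std::map<std::string, float> vocab = {
            {"un", -1.0f}, {"i", -3.0f}, {"gram", -1.5f}, {"unigram", -2.0f}};
        for (const auto & t : viterbi_tokenize("unigram", vocab)) {
            std::cout << t << ' ';
        }
        std::cout << '\n'; // "unigram" (score -2.0 beats un+i+gram = -5.5)
    }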
@@ -912,7 +977,7 @@ private:
         normalized->clear();
         normalized->reserve(input.size() * 3);
 
-        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
+        const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
 
         bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
         bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
@@ -957,7 +1022,7 @@ private:
     /*
      * This structure is a view wrapper for XOR-compressed double array (XCDA)
      * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
-     * Eeach bit-packed entry contains:
+     * Each bit-packed entry contains:
      * - BASE array value in bits 10-30
      * - LCHECK array value in bits 0-7
     * - LEAF array value in bit 9
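For reference, the comment fixed above documents how each 32-bit XCDA entry packs three logical arrays of the double-array trie into one word. A sketch of unpacking those documented fields with plain shifts and masks (illustrative only — the real `xcda_array_view` also handles the replacement-sequence encoding and the XOR transitions):

    #include <cstdint>
    #include <cstdio>

    // Unpack the three fields the comment documents from one 32-bit XCDA entry:
    // BASE in bits 10-30, LCHECK in bits 0-7, LEAF in bit 9.
    static uint32_t xcda_base  (uint32_t e) { return (e >> 10) & 0x1FFFFF; } // 21 bits
    static uint32_t xcda_lcheck(uint32_t e) { return e & 0xFF; }             // 8 bits
    static bool     xcda_leaf  (uint32_t e) { return (e >> 9) & 1; }

    int main() {
        const uint32_t entry = (42u << 10) | (1u << 9) | 0x41u; // BASE=42, LEAF=1, LCHECK='A'
        std::printf("base=%u leaf=%d lcheck=%c\n",
                    xcda_base(entry), (int) xcda_leaf(entry), (char) xcda_lcheck(entry));
    }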
@@ -994,13 +1059,21 @@ private:
         size_t xcda_array_size;
     };
 
+    // this structure stores the best tokenization so far at input_offset
+    struct best_tokenization {
+        llama_token token_id;
+        size_t input_offset;
+        float score_sum;
+    };
+
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
         if (input_offset == input.size()) {
             return { &input[input_offset], 0, 0 };
         }
 
         // if input prefix matches some user-defined token return this token as normalization result
-        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        auto user_defined_token_match =
+            ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
         if (user_defined_token_match.second > 0) {
             return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
         }
@@ -1008,8 +1081,8 @@ private:
         size_t longest_prefix_length = 0;
         size_t longest_prefix_offset = 0;
 
-        if (xcda_array_size > 0) {
-            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
+        if (ugm_tokenizer->xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
 
             // Find the longest normalized sequence matching the input prefix by walking
             // the XOR-compressed compact double array (XCDA) starting from the root node
@@ -1045,52 +1118,162 @@ private:
 
         if (longest_prefix_length > 0) {
             // we have a match, so return the replacement sequence
-            if (longest_prefix_offset >= prefix_replacements_size) {
+            if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
                 throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
             }
-            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
+            const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
             return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
-        } else {
-            // check if the input prefix contains a valid sequence of UTF-8 code units
-            try {
-                // if yes, return this sequence unmodified
-                size_t prefix_offset = input_offset;
-                unicode_cpt_from_utf8(input, prefix_offset);
-                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
-            } catch (std::invalid_argument & /*ex*/) {
-                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
-                return { "\xEF\xBF\xBD", 3, 1 };
-            }
+        }
+
+        // check if the input prefix contains a valid sequence of UTF-8 code units
+        try {
+            // if yes, return this sequence unmodified
+            size_t prefix_offset = input_offset;
+            unicode_cpt_from_utf8(input, prefix_offset);
+            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+        } catch (std::invalid_argument & /*ex*/) {
+            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+            return { "\xEF\xBF\xBD", 3, 1 };
         }
     }
 
-    // escaped space symbol - U+2581 (Lower One Eighth Block)
-    const std::string escaped_space = "\xE2\x96\x81";
+    const llama_vocab & vocab;
+    const llm_tokenizer_ugm * ugm_tokenizer;
+};
 
-    const char * prefix_replacements = NULL;
-    size_t prefix_replacements_size = 0;
+//
+// RWKV tokenizer
+//
 
-    const uint32_t * xcda_array = NULL;
-    size_t xcda_array_size = 0;
+static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
+    std::vector<uint8_t> output;
+    output.reserve(escaped.size());
+
+    // Parser state
+    bool escaping = false;
+    uint8_t hex_remaining = 0;
+    uint8_t hex_acc = 0;
+
+    // Step through characters, performing parsing
+    for (const char & c : escaped) {
+        // If we're parsing a hex code, interpret the next character
+        if (hex_remaining != 0) {
+            uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
+            hex_acc = (hex_acc << 4) + value;
+
+            hex_remaining -= 1;
+            if (hex_remaining == 0) {
+                output.push_back(hex_acc);
+                hex_acc = 0;
+            }
 
-    struct naive_trie user_defined_token_matcher;
+            continue;
+        }
 
-    // this structure stores the best tokenization so far at input_offset
-    struct best_tokenization {
-        llama_token token_id;
-        size_t input_offset;
-        float score_sum;
-    };
+        // If we got an escape character, interpret it
+        if (escaping) {
+            if (c == 't') {
+                output.push_back('\t');
+            } else if (c == 'n') {
+                output.push_back('\n');
+            } else if (c == 'r') {
+                output.push_back('\r');
+            } else if (c == 'x') {
+                hex_remaining = 2;
+            } else {
+                output.push_back(c);
+            }
 
-    float min_score = FLT_MAX;
-    float max_score = -FLT_MAX;
+            escaping = false;
+            continue;
+        }
 
-    float unknown_token_score_penalty = 10.0;
-    float unknown_token_score;
+        if (c == '\\') {
+            escaping = true;
+            continue;
+        }
+
+        output.push_back(c);
+    }
+
+    return output;
+}
+
+struct llm_tokenizer_rwkv : llm_tokenizer {
+    llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
+        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
+        // For now, we decode the vocab here into the lookup we'll use for tokenization.
+
+        // build trie
+        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
+            const auto & token = vocab.id_to_token[id];
+            const auto data = llama_unescape_rwkv_token(token.text);
+            token_matcher.insert((const char *) data.data(), data.size(), id);
+        }
+    }
 
     struct naive_trie token_matcher;
 };
 
+struct llm_tokenizer_rwkv_session {
+    llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
+        rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        uint32_t position = 0;
+        while (position < text.size()) {
+            const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
+            if (node == NULL) {
+                // no matching token found, add unknown token
+                output.push_back(vocab.special_unk_id);
+                position += 1;
+                continue;
+            }
+
+            // traverse the trie to find the longest matching token
+            uint32_t token_id = 0;
+            uint32_t token_length = 0;
+            while (node != NULL) {
+                if (node->has_value) {
+                    token_id = node->value;
+                    token_length = position + 1;
+                }
+                node = node->traverse(text[++position]);
+            }
+
+            // add the longest matching token
+            output.push_back(token_id);
+            position = token_length;
+        }
+    }
+
+private:
+    const llama_vocab & vocab;
+    const llm_tokenizer_rwkv & rwkv_tokenizer;
+};
+
+void llama_vocab::init_tokenizer() {
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            tokenizer = new llm_tokenizer_spm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            tokenizer = new llm_tokenizer_bpe(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            tokenizer = new llm_tokenizer_wpm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            tokenizer = new llm_tokenizer_ugm(*this);
+            break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            tokenizer = new llm_tokenizer_rwkv(*this);
+            break;
+        default:
+            GGML_ABORT("unsupported vocab type");
+    }
+}
+
 //
 // (de-) tokenize
 //
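`init_tokenizer()` turns tokenizer construction into a one-time cost: previously `llama_tokenize_internal` rebuilt the whole tokenizer on every call (the UGM constructor re-parsed the precompiled charsmap, BPE re-selected its regex set, and the new RWKV trie would have had to be rebuilt likewise), whereas now the switch runs once and each call only creates a cheap session. The diff stores the result as a raw `llm_tokenizer *` released in `~llama_vocab()`; below is a sketch of the same factory shape using `std::unique_ptr`, shown only to make the ownership explicit (hypothetical, not what the patch does):

    #include <memory>
    #include <stdexcept>

    struct tokenizer_base { virtual ~tokenizer_base() = default; };
    struct tokenizer_spm : tokenizer_base {};
    struct tokenizer_bpe : tokenizer_base {};

    enum class vocab_type { SPM, BPE };

    // factory: pick the concrete tokenizer once, based on the vocab type
    static std::unique_ptr<tokenizer_base> make_tokenizer(vocab_type t) {
        switch (t) {
            case vocab_type::SPM: return std::make_unique<tokenizer_spm>();
            case vocab_type::BPE: return std::make_unique<tokenizer_bpe>();
        }
        throw std::runtime_error("unsupported vocab type");
    }

    int main() {
        auto tok = make_tokenizer(vocab_type::SPM); // freed automatically, no manual delete
        (void) tok;
    }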
@@ -1152,7 +1335,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
         // if a fragment is text ( not yet processed )
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-            auto & raw_text = fragment.raw_text;
+            const auto & raw_text = fragment.raw_text;
 
             auto raw_text_base_offset = fragment.offset;
             auto raw_text_base_length = fragment.length;
@@ -1251,7 +1434,13 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
     }
 }
 
-std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special) {
+    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
     std::vector<llama_vocab::id> output;
     std::forward_list<fragment_buffer_variant> fragment_buffer;
 
@@ -1288,9 +1477,9 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
-                        tokenizer.tokenize(raw_text, output);
+                        llm_tokenizer_spm_session session(vocab);
+                        session.tokenize(raw_text, output);
                         is_prev_special = false;
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -1312,10 +1501,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab);
-
+                llm_tokenizer_bpe_session session(vocab);
+                // it calls some other methods that are not exist in llm_tokenizer,
+                // here just cast it to bpe tokenizer object
                 if (add_special) {
-                    tokenizer.append_bos(output);
+                    session.append_bos(output);
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1324,15 +1514,15 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        tokenizer.append(fragment.token, output);
+                        session.append(fragment.token, output);
                     }
                 }
 
                 if (add_special) {
-                    tokenizer.append_eos(output);
-                    tokenizer.check_double_bos_eos(output);
+                    session.append_eos(output);
+                    session.check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
@@ -1342,7 +1532,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                     output.push_back(vocab.special_cls_id);
                 }
 
-                llm_tokenizer_wpm tokenizer(vocab);
+                llm_tokenizer_wpm_session session(vocab);
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1351,7 +1541,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -1364,12 +1554,11 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
-                llm_tokenizer_ugm tokenizer(vocab);
-
-                if (add_special && vocab.tokenizer_add_bos != 0) {
+                if (add_special && vocab.tokenizer_add_bos) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
+                llm_tokenizer_ugm_session session(vocab);
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -1377,24 +1566,41 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        tokenizer.tokenize(raw_text, output);
+                        session.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
 
-                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                     LLAMA_LOG_WARN(
                         "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                         "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                         "Are you sure this is what you want?\n", __FUNCTION__);
                 }
 
-                if (add_special && vocab.tokenizer_add_eos == 1) {
+                if (add_special && vocab.tokenizer_add_eos) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            {
+                llm_tokenizer_rwkv_session session(vocab);
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+
+                        session.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+            } break;
         case LLAMA_VOCAB_TYPE_NONE:
             GGML_ABORT("fatal error");
     }
@@ -1442,10 +1648,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 }
 
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && (
-        token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
-    );
+    return token != -1 && vocab.special_eog_ids.count(token) > 0;
 }
 
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
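`llama_token_is_eog_impl` previously recognized only EOS and EOT; testing membership in `vocab.special_eog_ids`, a set filled at model-load time, lets any number of end-of-generation markers — e.g. the end-of-message token exposed by `llama_token_eom_impl` below — share one check. A tiny sketch of the idea, with made-up token ids and a `std::set` standing in for the vocab field:

    #include <set>

    using llama_token_t = int; // stand-in for llama_token

    static bool is_eog(const std::set<llama_token_t> & eog_ids, llama_token_t token) {
        return token != -1 && eog_ids.count(token) > 0;
    }

    int main() {
        // made-up ids: eos, eot, eom for some hypothetical model
        const std::set<llama_token_t> eog_ids = {2, 32007, 32001};
        return (is_eog(eog_ids, 32007) && !is_eog(eog_ids, 5)) ? 0 : 1;
    }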
@@ -1460,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
     return vocab.special_eos_id;
 }
 
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eot_id;
+}
+
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
     return vocab.special_cls_id;
 }
@@ -1476,38 +1687,58 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
     return vocab.special_pad_id;
 }
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_bos;
 }
 
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_eos;
 }
 
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
-    return vocab.special_prefix_id;
+    return vocab.special_fim_pre_id;
 }
 
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
-    return vocab.special_middle_id;
+    return vocab.special_fim_mid_id;
 }
 
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
-    return vocab.special_suffix_id;
+    return vocab.special_fim_suf_id;
 }
 
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
-    return vocab.special_eot_id;
+llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_pre_id;
+}
+
+llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_suf_id;
+}
+
+llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_mid_id;
+}
+
+llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_pad_id;
+}
+
+llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_rep_id;
+}
+
+llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_fim_sep_id;
 }
 
 int32_t llama_tokenize_impl(
-    const struct llama_vocab & vocab,
-    const char * text,
-    int32_t text_len,
-    llama_token * tokens,
-    int32_t n_tokens_max,
-    bool add_special,
-    bool parse_special) {
+        const struct llama_vocab & vocab,
+        const char * text,
+        int32_t text_len,
+        llama_token * tokens,
+        int32_t n_tokens_max,
+        bool add_special,
+        bool parse_special) {
     auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -1584,11 +1815,13 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
             // suppressing them like CONTROL tokens.
             if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                 return _try_copy(token_text.data(), token_text.size());
-            } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+            }
+            if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                 std::string result = token_text;
                 llama_unescape_whitespace(result);
                 return _try_copy(result.data(), result.size());
-            } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+            }
+            if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                 char byte = (char) llama_token_to_byte(vocab, token);
                 return _try_copy((char*) &byte, 1);
             }
@@ -1599,12 +1832,24 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
             // suppressing them like CONTROL tokens.
             if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                 return _try_copy(token_text.data(), token_text.size());
-            } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+            }
+            if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                 std::string result = llama_decode_text(token_text);
                 return _try_copy(result.data(), result.size());
             }
             break;
         }
+        case LLAMA_VOCAB_TYPE_RWKV: {
+            std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
+
+            // If we don't have enough space, return an error
+            if (result.size() > (size_t)length) {
+                return -(int)result.size();
+            }
+
+            memcpy(buf, result.data(), result.size());
+            return (int)result.size();
+        }
         default:
             GGML_ABORT("fatal error");
     }
@@ -1621,6 +1866,8 @@ int32_t llama_detokenize_impl(
         int32_t text_len_max,
         bool remove_special,
         bool unparse_special) {
+    GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
     int32_t avail = text_len_max;
     int32_t total = 0;
 
@@ -1719,3 +1966,19 @@ int32_t llama_detokenize_impl(
 
     return total <= text_len_max ? total : -total;
 }
+
+std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
+    }
+
+    text.resize(n_chars);
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
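The new `llama_detokenize` wrapper shows the buffer-sizing convention used throughout this API (the RWKV branch of `llama_token_to_piece_impl` above follows it too): when the destination is too small, the implementation returns the negated required length, and the caller resizes and retries exactly once. The same convention around a hypothetical `fill_text` helper:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <string>

    // Hypothetical C-style API following the same convention: returns the number
    // of bytes written, or the negated required size if `len` is too small.
    static int32_t fill_text(char * buf, int32_t len) {
        const char * msg = "detokenized text";
        const int32_t need = (int32_t) std::strlen(msg);
        if (need > len) {
            return -need;
        }
        std::memcpy(buf, msg, need);
        return need;
    }

    int main() {
        std::string text;
        text.resize(8); // deliberately too small
        int32_t n = fill_text(&text[0], (int32_t) text.size());
        if (n < 0) {
            text.resize(-n);                                // grow to the required size
            n = fill_text(&text[0], (int32_t) text.size()); // retry once
        }
        text.resize(n);
        std::cout << text << '\n';
    }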