npm - @fugood/llama.node - Versions diffs - 0.3.6 → 0.3.8 - Mend

@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

package/README.md +17 -2
package/bin/darwin/arm64/llama-node.node +0 -0
package/bin/darwin/x64/llama-node.node +0 -0
package/bin/linux/arm64/llama-node.node +0 -0
package/bin/linux/x64/llama-node.node +0 -0
package/bin/linux-cuda/arm64/llama-node.node +0 -0
package/bin/linux-cuda/x64/llama-node.node +0 -0
package/bin/linux-vulkan/arm64/llama-node.node +0 -0
package/bin/linux-vulkan/x64/llama-node.node +0 -0
package/bin/win32/arm64/llama-node.node +0 -0
package/bin/win32/arm64/node.lib +0 -0
package/bin/win32/x64/llama-node.node +0 -0
package/bin/win32/x64/node.lib +0 -0
package/bin/win32-vulkan/arm64/llama-node.node +0 -0
package/bin/win32-vulkan/arm64/node.lib +0 -0
package/bin/win32-vulkan/x64/llama-node.node +0 -0
package/bin/win32-vulkan/x64/node.lib +0 -0
package/lib/binding.ts +3 -1
package/lib/index.js +16 -1
package/lib/index.ts +16 -0
package/package.json +1 -1
package/src/EmbeddingWorker.cpp +4 -3
package/src/LlamaCompletionWorker.cpp +4 -2
package/src/LlamaContext.cpp +61 -6
package/src/LlamaContext.h +1 -0
package/src/common.hpp +6 -11
package/src/llama.cpp/.github/workflows/build.yml +19 -17
package/src/llama.cpp/.github/workflows/docker.yml +77 -30
package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
package/src/llama.cpp/.github/workflows/server.yml +22 -3
package/src/llama.cpp/CMakeLists.txt +49 -24
package/src/llama.cpp/common/arg.cpp +82 -26
package/src/llama.cpp/common/arg.h +3 -0
package/src/llama.cpp/common/common.cpp +192 -72
package/src/llama.cpp/common/common.h +51 -18
package/src/llama.cpp/common/ngram-cache.cpp +12 -12
package/src/llama.cpp/common/ngram-cache.h +2 -2
package/src/llama.cpp/common/sampling.cpp +11 -6
package/src/llama.cpp/common/speculative.cpp +18 -15
package/src/llama.cpp/docs/build.md +2 -0
package/src/llama.cpp/examples/batched/batched.cpp +9 -7
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
package/src/llama.cpp/examples/infill/infill.cpp +23 -24
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
package/src/llama.cpp/examples/llava/clip.cpp +4 -2
package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
package/src/llama.cpp/examples/llava/llava.cpp +2 -2
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
package/src/llama.cpp/examples/main/main.cpp +51 -29
package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
package/src/llama.cpp/examples/run/run.cpp +175 -61
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
package/src/llama.cpp/examples/server/httplib.h +1295 -409
package/src/llama.cpp/examples/server/server.cpp +387 -181
package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
package/src/llama.cpp/examples/server/utils.hpp +170 -58
package/src/llama.cpp/examples/simple/simple.cpp +9 -8
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
package/src/llama.cpp/examples/tts/tts.cpp +64 -23
package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
package/src/llama.cpp/ggml/include/ggml.h +36 -145
package/src/llama.cpp/ggml/include/gguf.h +202 -0
package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
package/src/llama.cpp/ggml/src/ggml.c +117 -1327
package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
package/src/llama.cpp/include/llama-cpp.h +6 -1
package/src/llama.cpp/include/llama.h +138 -75
package/src/llama.cpp/src/CMakeLists.txt +13 -1
package/src/llama.cpp/src/llama-adapter.cpp +347 -0
package/src/llama.cpp/src/llama-adapter.h +74 -0
package/src/llama.cpp/src/llama-arch.cpp +1487 -0
package/src/llama.cpp/src/llama-arch.h +400 -0
package/src/llama.cpp/src/llama-batch.cpp +368 -0
package/src/llama.cpp/src/llama-batch.h +88 -0
package/src/llama.cpp/src/llama-chat.cpp +578 -0
package/src/llama.cpp/src/llama-chat.h +52 -0
package/src/llama.cpp/src/llama-context.cpp +1775 -0
package/src/llama.cpp/src/llama-context.h +128 -0
package/src/llama.cpp/src/llama-cparams.cpp +1 -0
package/src/llama.cpp/src/llama-cparams.h +37 -0
package/src/llama.cpp/src/llama-grammar.cpp +5 -4
package/src/llama.cpp/src/llama-grammar.h +3 -1
package/src/llama.cpp/src/llama-hparams.cpp +71 -0
package/src/llama.cpp/src/llama-hparams.h +139 -0
package/src/llama.cpp/src/llama-impl.cpp +167 -0
package/src/llama.cpp/src/llama-impl.h +16 -136
package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
package/src/llama.cpp/src/llama-kv-cache.h +218 -0
package/src/llama.cpp/src/llama-mmap.cpp +589 -0
package/src/llama.cpp/src/llama-mmap.h +67 -0
package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
package/src/llama.cpp/src/llama-model-loader.h +167 -0
package/src/llama.cpp/src/llama-model.cpp +3953 -0
package/src/llama.cpp/src/llama-model.h +370 -0
package/src/llama.cpp/src/llama-quant.cpp +934 -0
package/src/llama.cpp/src/llama-quant.h +1 -0
package/src/llama.cpp/src/llama-sampling.cpp +147 -32
package/src/llama.cpp/src/llama-sampling.h +3 -19
package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
package/src/llama.cpp/src/llama-vocab.h +97 -142
package/src/llama.cpp/src/llama.cpp +7160 -20314
package/src/llama.cpp/src/unicode.cpp +8 -3
package/src/llama.cpp/tests/CMakeLists.txt +2 -0
package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
package/src/llama.cpp/tests/test-gguf.cpp +222 -187
package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
package/src/llama.cpp/tests/test-sampling.cpp +0 -1
package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6

package/src/llama.cpp/include/llama-cpp.h CHANGED Viewed

@@ -9,7 +9,7 @@
 #include "llama.h"
 struct llama_model_deleter {
-    void operator()(llama_model * model) { llama_free_model(model); }
+    void operator()(llama_model * model) { llama_model_free(model); }
 };
 struct llama_context_deleter {
@@ -20,6 +20,11 @@ struct llama_sampler_deleter {
     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
 };
+struct llama_adapter_lora_deleter {
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
+};
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
+typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;

package/src/llama.cpp/include/llama.h CHANGED Viewed

@@ -34,7 +34,6 @@
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -57,7 +56,7 @@ extern "C" {
     // TODO: show sample usage
     //
-    // struct llama_vocab; // TODO: add in the future
+    struct llama_vocab;
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
@@ -105,6 +104,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
     };
     enum llama_rope_type {
@@ -288,9 +288,6 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
-        // comma separated list of RPC servers to use for offloading
-        const char * rpc_servers;
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
@@ -385,7 +382,7 @@ extern "C" {
     } llama_chat_message;
     // lora adapter
-    struct llama_lora_adapter;
+    struct llama_adapter_lora;
     // Helpers for getting default parameters
     // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
@@ -399,30 +396,53 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
     // Optional: an auto threadpool gets created in ggml if not passed explicitly
     LLAMA_API void llama_attach_threadpool(
-               struct   llama_context * ctx,
-            ggml_threadpool_t   threadpool,
-            ggml_threadpool_t   threadpool_batch);
+            struct llama_context * ctx,
+               ggml_threadpool_t   threadpool,
+               ggml_threadpool_t   threadpool_batch);
     LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free(void);
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+              struct llama_model_params   params),
+            "use llama_model_load_from_file instead");
-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    // Load the model from a file
+    // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
+    // If the split file name does not follow this pattern, use llama_model_load_from_splits
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
               struct llama_model_params   params);
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    // Load the model from multiple splits (support custom naming scheme)
+    // The paths must be in the correct order
+    LLAMA_API struct llama_model * llama_model_load_from_splits(
+                             const char ** paths,
+                                 size_t    n_paths,
+              struct llama_model_params    params);
-    // TODO: rename to llama_init_from_model
-    LLAMA_API struct llama_context * llama_new_context_with_model(
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+    LLAMA_API void llama_model_free(struct llama_model * model);
+    LLAMA_API struct llama_context * llama_init_from_model(
                      struct llama_model * model,
             struct llama_context_params   params);
+    DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params),
+            "use llama_init_from_model instead");
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -440,20 +460,30 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
-    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
+    DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");
-    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
-    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
-    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
+    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
+    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     // Get the model's RoPE frequency scaling factor
-    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
+    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
+    LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
@@ -479,6 +509,9 @@ extern "C" {
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Get the default chat template. Returns nullptr if not available
+    LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
@@ -501,32 +534,36 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
+    //
+    // Adapters
+    //
     // Load a LoRA adapter from file
-    // The loaded adapter will be associated to the given model, and will be free when the model is deleted
-    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);
+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be free when the associated model is deleted
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+    // The following functions operate on a llama_context, hence the naming: llama_verb_...
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
-    LLAMA_API int32_t llama_lora_adapter_set(
+    LLAMA_API int32_t llama_set_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter,
+            struct llama_adapter_lora * adapter,
             float scale);
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
-    LLAMA_API int32_t llama_lora_adapter_remove(
+    LLAMA_API int32_t llama_rm_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter);
+            struct llama_adapter_lora * adapter);
     // Remove all LoRA adapters from given context
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
-    // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
-    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
+    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
@@ -534,8 +571,8 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
-    LLAMA_API int32_t llama_control_vector_apply(
-            struct llama_context * lctx,
+    LLAMA_API int32_t llama_apply_adapter_cvec(
+            struct llama_context * ctx,
                      const float * data,
                           size_t   len,
                          int32_t   n_embd,
@@ -546,6 +583,8 @@ extern "C" {
     // KV cache
     //
+    // TODO: remove llama_kv_cache_view_* API
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
         // The position for this cell. Takes KV cache shifts into account.
@@ -592,8 +631,11 @@ extern "C" {
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+    ///
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -663,6 +705,9 @@ extern "C" {
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
@@ -883,41 +928,60 @@ extern "C" {
     // Vocab
     //
-    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
+    LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
-    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
+    LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
-    LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
+    LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
-    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+    LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
     // Identify if Token Id is a control token or a render-able token
-    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+    LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
-    LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
-    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
-    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
-    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
-    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
-    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
-    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
-    // infill tokens
-    DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
-    DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
-    DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
-    LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
-    LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
-    LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
-    LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
-    LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
-    LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
+    LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
+    LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
+    LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
+    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
+    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
+    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
+    DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
+    DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
+    DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
+    DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
+    DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
+    DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
+    DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
+    // CLS is equivalent to BOS
+    DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
+            "use llama_vocab_bos instead");
     //
     // Tokenization
@@ -933,7 +997,7 @@ extern "C" {
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
                       const char * text,
                          int32_t   text_len,
                      llama_token * tokens,
@@ -947,7 +1011,7 @@ extern "C" {
     // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
-              const struct llama_model * model,
+              const struct llama_vocab * vocab,
                            llama_token   token,
                                   char * buf,
                                int32_t   length,
@@ -961,7 +1025,7 @@ extern "C" {
     /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
     /// @param unparse_special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_detokenize(
-        const struct llama_model * model,
+        const struct llama_vocab * vocab,
                const llama_token * tokens,
                          int32_t   n_tokens,
                             char * text,
@@ -984,7 +1048,6 @@ extern "C" {
     /// @param length The size of the allocated buffer
     /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
     LLAMA_API int32_t llama_chat_apply_template(
-              const struct llama_model * model,
                             const char * tmpl,
        const struct llama_chat_message * chat,
                                 size_t   n_msg,
@@ -1032,7 +1095,6 @@ extern "C" {
     //    llama_sampler_free(smpl);
     //
     // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
     //
     typedef void * llama_sampler_context_t;
@@ -1132,7 +1194,7 @@ extern "C" {
                                float   eta);
     LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
-            const struct llama_model * model,
+            const struct llama_vocab * vocab,
                           const char * grammar_str,
                           const char * grammar_root);
@@ -1144,8 +1206,9 @@ extern "C" {
                                float   penalty_present); // 0.0 = disabled
     ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
-    LLAMA_API struct llama_sampler *    llama_sampler_init_dry(
-            const struct llama_model *  model,
+    LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+            const struct llama_vocab *  vocab,
+                             int32_t    n_ctx_train,
                                float    dry_multiplier,
                                float    dry_base,
                              int32_t    dry_allowed_length,
@@ -1179,7 +1242,7 @@ extern "C" {
     // 3. discard non-EOG tokens with low prob
     // 4. if no tokens are left -> pick EOT
     //
-    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);

package/src/llama.cpp/src/CMakeLists.txt CHANGED Viewed

@@ -9,9 +9,21 @@ llama_add_compile_flags()
 add_library(llama
             ../include/llama.h
             llama.cpp
-            llama-vocab.cpp
+            llama-adapter.cpp
+            llama-arch.cpp
+            llama-batch.cpp
+            llama-chat.cpp
+            llama-context.cpp
             llama-grammar.cpp
+            llama-hparams.cpp
+            llama-impl.cpp
+            llama-kv-cache.cpp
+            llama-mmap.cpp
+            llama-model-loader.cpp
+            llama-model.cpp
+            llama-quant.cpp
             llama-sampling.cpp
+            llama-vocab.cpp
             unicode.h
             unicode.cpp
             unicode-data.cpp