cui-llama.rn 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +43 -26
  4. package/cpp/common.h +18 -11
  5. package/cpp/ggml-backend-reg.cpp +5 -0
  6. package/cpp/ggml-backend.cpp +5 -2
  7. package/cpp/ggml-cpp.h +1 -0
  8. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  9. package/cpp/ggml-cpu-quants.c +5 -1
  10. package/cpp/ggml-impl.h +11 -16
  11. package/cpp/ggml-metal.m +2 -2
  12. package/cpp/ggml.c +0 -1276
  13. package/cpp/ggml.h +0 -140
  14. package/cpp/gguf.cpp +1325 -0
  15. package/cpp/gguf.h +202 -0
  16. package/cpp/llama-adapter.cpp +346 -0
  17. package/cpp/llama-adapter.h +73 -0
  18. package/cpp/llama-arch.cpp +1434 -0
  19. package/cpp/llama-arch.h +395 -0
  20. package/cpp/llama-batch.cpp +368 -0
  21. package/cpp/llama-batch.h +88 -0
  22. package/cpp/llama-chat.cpp +567 -0
  23. package/cpp/llama-chat.h +51 -0
  24. package/cpp/llama-context.cpp +1771 -0
  25. package/cpp/llama-context.h +128 -0
  26. package/cpp/llama-cparams.cpp +1 -0
  27. package/cpp/llama-cparams.h +37 -0
  28. package/cpp/llama-cpp.h +30 -0
  29. package/cpp/llama-grammar.cpp +1 -0
  30. package/cpp/llama-grammar.h +3 -1
  31. package/cpp/llama-hparams.cpp +71 -0
  32. package/cpp/llama-hparams.h +140 -0
  33. package/cpp/llama-impl.cpp +167 -0
  34. package/cpp/llama-impl.h +16 -136
  35. package/cpp/llama-kv-cache.cpp +718 -0
  36. package/cpp/llama-kv-cache.h +218 -0
  37. package/cpp/llama-mmap.cpp +589 -0
  38. package/cpp/llama-mmap.h +67 -0
  39. package/cpp/llama-model-loader.cpp +1011 -0
  40. package/cpp/llama-model-loader.h +158 -0
  41. package/cpp/llama-model.cpp +2202 -0
  42. package/cpp/llama-model.h +391 -0
  43. package/cpp/llama-sampling.cpp +117 -4
  44. package/cpp/llama-vocab.cpp +21 -28
  45. package/cpp/llama-vocab.h +13 -1
  46. package/cpp/llama.cpp +8437 -19421
  47. package/cpp/llama.cpp.rej +23 -0
  48. package/cpp/llama.h +31 -6
  49. package/cpp/rn-llama.hpp +39 -37
  50. package/cpp/sgemm.cpp +776 -70
  51. package/cpp/unicode.cpp +6 -0
  52. package/package.json +1 -1
package/cpp/llama.cpp.rej ADDED
@@ -0,0 +1,23 @@
+ --- llama.cpp.orig 2024-11-02 12:42:13
+ +++ llama.cpp 2024-11-02 13:00:37
+ @@ -1941,16 +1952,16 @@
+
+ if (prefetch > 0) {
+ // advise the kernel to preload the mapped memory
+ - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+ - LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+ + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
+ + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ strerror(errno));
+ }
+ }
+ if (numa) {
+ // advise the kernel not to use readahead
+ // (because the next page might not belong on the same node)
+ - if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+ - LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+ + if (madvise(addr, file->size, MADV_RANDOM)) {
+ + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+ strerror(errno));
+ }
+ }
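The reject file above records a hunk from the fork's local patch that swaps posix_madvise for madvise (and LLAMA_LOG_WARN for fprintf) in the mmap prefetch path; the hunk no longer applies cleanly, presumably because that code moved into the new llama-mmap.cpp during this upstream sync. A minimal, self-contained sketch of the pattern the patch applies (the function name and structure are illustrative, not the package's actual code):

    #include <sys/mman.h>   // madvise, MADV_WILLNEED, MADV_RANDOM
    #include <algorithm>    // std::min
    #include <cerrno>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    // Advise the kernel about the access pattern of an mmap'd model file.
    static void advise_mapping(void * addr, size_t file_size, size_t prefetch, bool numa) {
        if (prefetch > 0) {
            // ask the kernel to preload the first `prefetch` bytes of the mapping
            if (madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno));
            }
        }
        if (numa) {
            // disable readahead: the next page might not belong to the same NUMA node
            if (madvise(addr, file_size, MADV_RANDOM)) {
                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", strerror(errno));
            }
        }
    }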
package/cpp/llama.h CHANGED
@@ -35,7 +35,6 @@

  #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

- // TODO: use everywhere in the implementation
  #define LLAMA_TOKEN_NULL -1

  #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -106,6 +105,7 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
  LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
  LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
  };

  enum llama_rope_type {
@@ -386,6 +386,7 @@ extern "C" {
  } llama_chat_message;

  // lora adapter
+ // TODO: rename to llama_adapter_lora
  struct llama_lora_adapter;

  // Helpers for getting default parameters
@@ -413,11 +414,19 @@ extern "C" {
  // Call once at the end of the program - currently only used for MPI
  LLAMA_API void llama_backend_free(void);

- LLAMA_API struct llama_model * llama_load_model_from_file(
+ DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_model_params params),
+ "use llama_model_load_from_file instead");
+
+ LLAMA_API struct llama_model * llama_model_load_from_file(
  const char * path_model,
  struct llama_model_params params);

- LLAMA_API void llama_free_model(struct llama_model * model);
+ DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+ "use llama_model_free instead");
+
+ LLAMA_API void llama_model_free(struct llama_model * model);

  // TODO: rename to llama_init_from_model
  LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -502,14 +511,19 @@ extern "C" {
  const char * fname_out,
  const llama_model_quantize_params * params);

+ //
+ // Adapters
+ //
+
  // Load a LoRA adapter from file
- // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+ // TODO: rename to llama_adapter_lora_init
  LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
  struct llama_model * model,
  const char * path_lora);

  // Add a loaded LoRA adapter to given context
  // This will not modify model's weight
+ // TODO: rename to llama_set_adapter_lora
  LLAMA_API int32_t llama_lora_adapter_set(
  struct llama_context * ctx,
  struct llama_lora_adapter * adapter,
@@ -517,16 +531,18 @@ extern "C" {

  // Remove a specific LoRA adapter from given context
  // Return -1 if the adapter is not present in the context
+ // TODO: rename to llama_rm_adapter_lora
  LLAMA_API int32_t llama_lora_adapter_remove(
  struct llama_context * ctx,
  struct llama_lora_adapter * adapter);

  // Remove all LoRA adapters from given context
- LLAMA_API void llama_lora_adapter_clear(
- struct llama_context * ctx);
+ // TODO: rename to llama_clear_adapter_lora
+ LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);

  // Manually free a LoRA adapter
  // Note: loaded adapters will be free when the associated model is deleted
+ // TODO: rename to llama_adapter_lora_free
  LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);

  // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -535,6 +551,7 @@ extern "C" {
  // to an n_embd x n_layers buffer starting from layer 1.
  // il_start and il_end are the layer range the vector should apply to (both inclusive)
  // See llama_control_vector_load in common to load a control vector.
+ // TODO: rename to llama_adapter_cvec_apply
  LLAMA_API int32_t llama_control_vector_apply(
  struct llama_context * lctx,
  const float * data,
@@ -547,6 +564,8 @@ extern "C" {
  // KV cache
  //

+ // TODO: remove llama_kv_cache_view_* API
+
  // Information associated with an individual cell in the KV cache view.
  struct llama_kv_cache_view_cell {
  // The position for this cell. Takes KV cache shifts into account.
@@ -593,8 +612,11 @@ extern "C" {
  LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

  // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+ // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
  LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

+ ///
+
  // Returns the number of tokens in the KV cache (slow, use only for debug)
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
  LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -664,6 +686,9 @@ extern "C" {
  struct llama_context * ctx,
  llama_seq_id seq_id);

+ // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+ // how to avoid this?
+
  // Defragment the KV cache
  // This will be applied:
  // - lazily on next llama_decode()
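For downstream callers, the practical effect of the llama.h changes above is that llama_load_model_from_file and llama_free_model are now deprecated in favor of llama_model_load_from_file and llama_model_free. A minimal migration sketch (the model path is a placeholder):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();

        // before (still compiles, but now triggers the DEPRECATED warning):
        //   llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        //   ...
        //   llama_free_model(model);

        // after:
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }
        // ... create a context, run inference ...
        llama_model_free(model);  // replaces llama_free_model
        return 0;
    }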
package/cpp/rn-llama.hpp CHANGED
@@ -8,6 +8,7 @@
  #include "llama.h"
  #include "llama-impl.h"
  #include "sampling.h"
+ #include "llama-cpp.h"

  namespace rnllama {

@@ -187,7 +188,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
  }

  template <class Iter>
- static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+ static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end)
  {
  std::string ret;
  for (; begin != end; ++begin)
@@ -214,11 +215,11 @@ struct llama_rn_context

  common_params params;

- llama_model *model = nullptr;
+ llama_model_ptr model = nullptr;
  float loading_progress = 0;
  bool is_load_interrupted = false;

- llama_context *ctx = nullptr;
+ llama_context_ptr ctx = nullptr;
  common_sampler *ctx_sampling = nullptr;

  int n_ctx;
@@ -234,12 +235,12 @@ struct llama_rn_context
  {
  if (ctx)
  {
- llama_free(ctx);
+ llama_free(ctx.get());
  ctx = nullptr;
  }
  if (model)
  {
- llama_free_model(model);
+ llama_model_free(model.get());
  model = nullptr;
  }
  if (ctx_sampling != nullptr)
@@ -273,7 +274,7 @@ struct llama_rn_context
  if (ctx_sampling != nullptr) {
  common_sampler_free(ctx_sampling);
  }
- ctx_sampling = common_sampler_init(model, params.sampling);
+ ctx_sampling = common_sampler_init(model.get(), params.sampling);
  return ctx_sampling != nullptr;
  }

@@ -281,26 +282,26 @@ struct llama_rn_context
  {
  params = params_;
  common_init_result result = common_init_from_params(params);
- model = result.model;
- ctx = result.context;
+ model = std::move(result.model);
+ ctx = std::move(result.context);
  if (model == nullptr)
  {
  LOG_ERROR("unable to load model: %s", params_.model.c_str());
  return false;
  }
  LOG_VERBOSE("getting n_ctx");
- n_ctx = llama_n_ctx(ctx);
+ n_ctx = llama_n_ctx(ctx.get());
  return true;
  }

  bool validateModelChatTemplate() const {
  std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
  std::string template_key = "tokenizer.chat_template";
- int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+ int32_t res = llama_model_meta_val_str(model.get(), template_key.c_str(), model_template.data(), model_template.size());
  if (res >= 0) {
  llama_chat_message chat[] = {{"user", "test"}};
  std::string tmpl = std::string(model_template.data(), model_template.size());
- int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ int32_t chat_res = llama_chat_apply_template(model.get(), tmpl.c_str(), chat, 1, true, nullptr, 0);
  return chat_res > 0;
  }
  return res > 0;
@@ -330,7 +331,7 @@ struct llama_rn_context

  void loadPrompt()
  {
- std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
+ std::vector<llama_token> prompt_tokens = ::common_tokenize(model.get(), params.prompt, true, true);
  num_prompt_tokens = prompt_tokens.size();

  // LOG tokens
@@ -358,7 +359,7 @@ struct llama_rn_context

  // do Context Shift , may be buggy! TODO: Verify functionality
  if(!params.embedding){
- purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+ purge_missing_tokens(ctx.get(), embd, prompt_tokens, params.n_predict, params.n_ctx);
  }

  // push the prompt into the sampling context (do not apply grammar)
@@ -379,7 +380,7 @@ struct llama_rn_context
  }

  // since #3228 we now have to manually manage the KV cache
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+ llama_kv_cache_seq_rm(ctx.get(), 0, n_past, -1);

  LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
  n_past,
@@ -394,7 +395,7 @@ struct llama_rn_context
  {
  // number of tokens to keep when resetting context
  n_remain = params.n_predict;
- llama_perf_context_reset(ctx);
+ llama_perf_context_reset(ctx.get());
  is_predicting = true;
  }

@@ -410,8 +411,8 @@ struct llama_rn_context
  const int n_left = n_past - params.n_keep - 1;
  const int n_discard = n_left/2;

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+ llama_kv_cache_seq_rm (ctx.get(), 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+ llama_kv_cache_seq_add(ctx.get(), 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

  for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
  {
@@ -437,13 +438,14 @@ struct llama_rn_context
  {
  n_eval = params.n_batch;
  }
- if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+ if (llama_decode(ctx.get(), llama_batch_get_one(&embd[n_past], n_eval)))
  {
+
  LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
  n_eval,
  n_past,
  params.cpuparams.n_threads,
- tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
+ tokens_to_str(ctx.get(), embd.cbegin() + n_past, embd.cend()).c_str()
  );
  has_next_token = false;
  return result;
@@ -461,16 +463,16 @@ struct llama_rn_context
  if (params.n_predict == 0)
  {
  has_next_token = false;
- result.tok = llama_token_eos(model);
+ result.tok = llama_token_eos(model.get());
  return result;
  }

  {
  // out of user input, sample next token
  std::vector<llama_token_data> candidates;
- candidates.reserve(llama_n_vocab(model));
+ candidates.reserve(llama_n_vocab(model.get()));

- result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
+ result.tok = common_sampler_sample(ctx_sampling, ctx.get(), -1);

  llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);

@@ -501,7 +503,7 @@ struct llama_rn_context
  // decrement remaining sampling budget
  --n_remain;

- if (!embd.empty() && embd.back() == llama_token_eos(model))
+ if (!embd.empty() && embd.back() == llama_token_eos(model.get()))
  {
  // stopping_word = llama_token_to_piece(ctx, embd.back());
  has_next_token = false;
@@ -550,7 +552,7 @@ struct llama_rn_context
  {
  const completion_token_output token_with_probs = nextToken();

- const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
+ const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx.get(), token_with_probs.tok);
  generated_text += token_text;

  if (params.sampling.n_probs > 0)
@@ -606,7 +608,7 @@ struct llama_rn_context

  std::vector<float> getEmbedding(common_params &embd_params)
  {
- static const int n_embd = llama_n_embd(llama_get_model(ctx));
+ static const int n_embd = llama_n_embd(llama_get_model(ctx.get()));
  if (!embd_params.embedding)
  {
  LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding);
@@ -614,12 +616,12 @@ struct llama_rn_context
  }
  float *data;

- const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+ const enum llama_pooling_type pooling_type = llama_pooling_type(ctx.get());
  printf("pooling_type: %d\n", pooling_type);
  if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
- data = llama_get_embeddings(ctx);
+ data = llama_get_embeddings(ctx.get());
  } else {
- data = llama_get_embeddings_seq(ctx, 0);
+ data = llama_get_embeddings_seq(ctx.get(), 0);
  }

  if (!data) {
@@ -661,15 +663,15 @@ struct llama_rn_context
  }
  batch.logits[batch.n_tokens - 1] = 1; // true

- llama_kv_cache_clear(ctx);
+ llama_kv_cache_clear(ctx.get());

  const int64_t t_pp_start = llama_time_us();
- if (llama_decode(ctx, batch) != 0)
+ if (llama_decode(ctx.get(), batch) != 0)
  {
  LOG_ERROR("llama_decode() failed during prompt", "");
  }
  const int64_t t_pp_end = llama_time_us();
- llama_kv_cache_clear(ctx);
+ llama_kv_cache_clear(ctx.get());

  if (is_interrupted) break;

@@ -684,7 +686,7 @@ struct llama_rn_context
  llama_batch_add(&batch, 0, i, {j}, true);
  }

- if (llama_decode(ctx, batch) != 0)
+ if (llama_decode(ctx.get(), batch) != 0)
  {
  LOG_ERROR("llama_decode() failed during text generation", "");
  }
@@ -693,7 +695,7 @@ struct llama_rn_context

  const int64_t t_tg_end = llama_time_us();

- llama_kv_cache_clear(ctx);
+ llama_kv_cache_clear(ctx.get());

  const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
  const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
@@ -719,14 +721,14 @@ struct llama_rn_context
  tg_std = 0;
  }

- if (is_interrupted) llama_kv_cache_clear(ctx);
+ if (is_interrupted) llama_kv_cache_clear(ctx.get());
  is_predicting = false;

  char model_desc[128];
- llama_model_desc(model, model_desc, sizeof(model_desc));
+ llama_model_desc(model.get(), model_desc, sizeof(model_desc));
  return std::string("[\"") + model_desc + std::string("\",") +
- std::to_string(llama_model_size(model)) + std::string(",") +
- std::to_string(llama_model_n_params(model)) + std::string(",") +
+ std::to_string(llama_model_size(model.get())) + std::string(",") +
+ std::to_string(llama_model_n_params(model.get())) + std::string(",") +
  std::to_string(pp_avg) + std::string(",") +
  std::to_string(pp_std) + std::string(",") +
  std::to_string(tg_avg) + std::string(",") +
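The rn-llama.hpp changes above replace the raw llama_model */llama_context * members with the llama_model_ptr/llama_context_ptr owners pulled in through the new llama-cpp.h include, passing raw handles to the C API via .get(). A minimal sketch of that ownership pattern, assuming llama-cpp.h provides these types as std::unique_ptr aliases whose deleters free the model and context (the struct below is illustrative, not the package's actual code):

    #include "llama.h"
    #include "llama-cpp.h"   // llama_model_ptr / llama_context_ptr (assumed unique_ptr aliases)

    struct minimal_runner {
        llama_model_ptr   model;  // owns the model; freed automatically on reset/destruction
        llama_context_ptr ctx;    // owns the context

        bool load(const char * path) {
            model.reset(llama_model_load_from_file(path, llama_model_default_params()));
            if (!model) {
                return false;
            }
            ctx.reset(llama_new_context_with_model(model.get(), llama_context_default_params()));
            return ctx != nullptr;
        }

        uint32_t n_ctx() const {
            // raw handles are handed to the C API via .get(), as in the diff above
            return ctx ? llama_n_ctx(ctx.get()) : 0;
        }
    };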