cui-llama.rn 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -20
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +108 -37
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +1982 -1965
  10. package/cpp/common.h +665 -657
  11. package/cpp/ggml-backend-reg.cpp +5 -0
  12. package/cpp/ggml-backend.cpp +5 -2
  13. package/cpp/ggml-cpp.h +1 -0
  14. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  15. package/cpp/ggml-cpu-quants.c +5 -1
  16. package/cpp/ggml-cpu.c +14122 -14122
  17. package/cpp/ggml-cpu.cpp +627 -627
  18. package/cpp/ggml-impl.h +11 -16
  19. package/cpp/ggml-metal-impl.h +288 -0
  20. package/cpp/ggml-metal.m +2 -2
  21. package/cpp/ggml-opt.cpp +854 -0
  22. package/cpp/ggml-opt.h +216 -0
  23. package/cpp/ggml.c +0 -1276
  24. package/cpp/ggml.h +0 -140
  25. package/cpp/gguf.cpp +1325 -0
  26. package/cpp/gguf.h +202 -0
  27. package/cpp/llama-adapter.cpp +346 -0
  28. package/cpp/llama-adapter.h +73 -0
  29. package/cpp/llama-arch.cpp +1434 -0
  30. package/cpp/llama-arch.h +395 -0
  31. package/cpp/llama-batch.cpp +368 -0
  32. package/cpp/llama-batch.h +88 -0
  33. package/cpp/llama-chat.cpp +567 -0
  34. package/cpp/llama-chat.h +51 -0
  35. package/cpp/llama-context.cpp +1771 -0
  36. package/cpp/llama-context.h +128 -0
  37. package/cpp/llama-cparams.cpp +1 -0
  38. package/cpp/llama-cparams.h +37 -0
  39. package/cpp/llama-cpp.h +30 -0
  40. package/cpp/llama-grammar.cpp +1 -0
  41. package/cpp/llama-grammar.h +3 -1
  42. package/cpp/llama-hparams.cpp +71 -0
  43. package/cpp/llama-hparams.h +140 -0
  44. package/cpp/llama-impl.cpp +167 -0
  45. package/cpp/llama-impl.h +16 -136
  46. package/cpp/llama-kv-cache.cpp +718 -0
  47. package/cpp/llama-kv-cache.h +218 -0
  48. package/cpp/llama-mmap.cpp +589 -0
  49. package/cpp/llama-mmap.h +67 -0
  50. package/cpp/llama-model-loader.cpp +1011 -0
  51. package/cpp/llama-model-loader.h +158 -0
  52. package/cpp/llama-model.cpp +2202 -0
  53. package/cpp/llama-model.h +391 -0
  54. package/cpp/llama-sampling.cpp +117 -4
  55. package/cpp/llama-vocab.cpp +21 -28
  56. package/cpp/llama-vocab.h +13 -1
  57. package/cpp/llama.cpp +12547 -23528
  58. package/cpp/llama.h +31 -6
  59. package/cpp/rn-llama.hpp +90 -87
  60. package/cpp/sgemm.cpp +776 -70
  61. package/cpp/sgemm.h +14 -14
  62. package/cpp/unicode.cpp +6 -0
  63. package/ios/RNLlama.mm +47 -0
  64. package/ios/RNLlamaContext.h +3 -1
  65. package/ios/RNLlamaContext.mm +71 -14
  66. package/jest/mock.js +15 -3
  67. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  68. package/lib/commonjs/index.js +33 -37
  69. package/lib/commonjs/index.js.map +1 -1
  70. package/lib/module/NativeRNLlama.js.map +1 -1
  71. package/lib/module/index.js +31 -35
  72. package/lib/module/index.js.map +1 -1
  73. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  74. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  75. package/lib/typescript/index.d.ts +21 -36
  76. package/lib/typescript/index.d.ts.map +1 -1
  77. package/llama-rn.podspec +4 -18
  78. package/package.json +2 -3
  79. package/src/NativeRNLlama.ts +32 -13
  80. package/src/index.ts +52 -47
package/cpp/llama.h CHANGED
@@ -35,7 +35,6 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -106,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };
 
     enum llama_rope_type {
@@ -386,6 +386,7 @@ extern "C" {
     } llama_chat_message;
 
     // lora adapter
+    // TODO: rename to llama_adapter_lora
     struct llama_lora_adapter;
 
     // Helpers for getting default parameters
@@ -413,11 +414,19 @@ extern "C" {
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+                             struct llama_model_params params),
+            "use llama_model_load_from_file instead");
+
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
                              struct llama_model_params params);
 
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
 
     // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
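The hunk above deprecates llama_load_model_from_file / llama_free_model in favour of llama_model_load_from_file / llama_model_free. A minimal migration sketch for callers of the C API (the model path and the parameter tweak are illustrative, not taken from this package):

    #include "llama.h"

    int main(void) {
        struct llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // illustrative: CPU-only load

        // new name in the vendored llama.cpp; replaces llama_load_model_from_file()
        struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        // ... create a context with llama_new_context_with_model(), run inference ...

        llama_model_free(model); // replaces llama_free_model()
        return 0;
    }

The deprecated declarations are kept, so existing callers keep building against the old names while picking up deprecation warnings.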
@@ -502,14 +511,19 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    //
+    // Adapters
+    //
+
     // Load a LoRA adapter from file
-    // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+    // TODO: rename to llama_adapter_lora_init
     LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
             const char * path_lora);
 
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
+    // TODO: rename to llama_set_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_set(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter,
@@ -517,16 +531,18 @@ extern "C" {
 
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
+    // TODO: rename to llama_rm_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
     // Remove all LoRA adapters from given context
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
+    // TODO: rename to llama_clear_adapter_lora
+    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
 
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
+    // TODO: rename to llama_adapter_lora_free
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -535,6 +551,7 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
+    // TODO: rename to llama_adapter_cvec_apply
     LLAMA_API int32_t llama_control_vector_apply(
             struct llama_context * lctx,
                      const float * data,
@@ -547,6 +564,8 @@ extern "C" {
     // KV cache
     //
 
+    // TODO: remove llama_kv_cache_view_* API
+
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
         // The position for this cell. Takes KV cache shifts into account.
@@ -593,8 +612,11 @@ extern "C" {
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
 
     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
 
+    ///
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -664,6 +686,9 @@ extern "C" {
             struct llama_context * ctx,
                     llama_seq_id seq_id);
 
+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
+
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
package/cpp/rn-llama.hpp CHANGED
@@ -5,64 +5,35 @@
 #include <iostream>
 #include "common.h"
 #include "ggml.h"
+#include "gguf.h"
 #include "llama.h"
 #include "llama-impl.h"
 #include "sampling.h"
+#if defined(__ANDROID__)
+#include <android/log.h>
+#endif
 
 namespace rnllama {
 
-static std::string lm_gguf_data_to_str(enum lm_gguf_type type, const void * data, int i) {
-    switch (type) {
-        case LM_GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
-        case LM_GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
-        case LM_GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
-        case LM_GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
-        case LM_GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
-        case LM_GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
-        case LM_GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
-        case LM_GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
-        case LM_GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
-        case LM_GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
-        case LM_GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
-        default: return "unknown type: " + std::to_string(type);
-    }
-}
-
-static std::string lm_gguf_kv_to_str(const struct lm_gguf_context * ctx_gguf, int i) {
-    const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i);
+const std::vector<lm_ggml_type> kv_cache_types = {
+    LM_GGML_TYPE_F32,
+    LM_GGML_TYPE_F16,
+    LM_GGML_TYPE_BF16,
+    LM_GGML_TYPE_Q8_0,
+    LM_GGML_TYPE_Q4_0,
+    LM_GGML_TYPE_Q4_1,
+    LM_GGML_TYPE_IQ4_NL,
+    LM_GGML_TYPE_Q5_0,
+    LM_GGML_TYPE_Q5_1,
+};
 
-    switch (type) {
-        case LM_GGUF_TYPE_STRING:
-            return lm_gguf_get_val_str(ctx_gguf, i);
-        case LM_GGUF_TYPE_ARRAY:
-            {
-                const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx_gguf, i);
-                int arr_n = lm_gguf_get_arr_n(ctx_gguf, i);
-                const void * data = lm_gguf_get_arr_data(ctx_gguf, i);
-                std::stringstream ss;
-                ss << "[";
-                for (int j = 0; j < arr_n; j++) {
-                    if (arr_type == LM_GGUF_TYPE_STRING) {
-                        std::string val = lm_gguf_get_arr_str(ctx_gguf, i, j);
-                        // escape quotes
-                        replace_all(val, "\\", "\\\\");
-                        replace_all(val, "\"", "\\\"");
-                        ss << '"' << val << '"';
-                    } else if (arr_type == LM_GGUF_TYPE_ARRAY) {
-                        ss << "???";
-                    } else {
-                        ss << lm_gguf_data_to_str(arr_type, data, j);
-                    }
-                    if (j < arr_n - 1) {
-                        ss << ", ";
-                    }
-                }
-                ss << "]";
-                return ss.str();
-            }
-        default:
-            return lm_gguf_data_to_str(type, lm_gguf_get_val_data(ctx_gguf, i), 0);
+static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (lm_ggml_type_name(type) == s) {
+            return type;
+        }
     }
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 static void llama_batch_clear(llama_batch *batch) {
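The gguf pretty-printing helpers are gone (gguf.cpp now ships as its own source file; see the file list above), and in their place kv_cache_type_from_str maps a cache-type name to the matching lm_ggml_type by comparing against lm_ggml_type_name(), throwing for anything outside the whitelist. A small usage sketch (the "q8_0" string and the surrounding error handling are illustrative):

    #include <stdexcept>

    // sketch: resolving a user-supplied KV cache type, e.g. from JS-side options
    try {
        lm_ggml_type kt = rnllama::kv_cache_type_from_str("q8_0");
        // kt can then be assigned to the K/V cache type fields of the context params
    } catch (const std::runtime_error & e) {
        // unrecognized name: "Unsupported cache type: ..."
    }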
@@ -86,16 +57,32 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
 static void log(const char *level, const char *function, int line,
                 const char *format, ...)
 {
-    printf("[%s] %s:%d ", level, function, line);
-
     va_list args;
-    va_start(args, format);
-    vprintf(format, args);
-    va_end(args);
-
-    printf("\n");
+#if defined(__ANDROID__)
+    char prefix[256];
+    snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
+
+    va_start(args, format);
+    android_LogPriority priority;
+    if (strcmp(level, "ERROR") == 0) {
+        priority = ANDROID_LOG_ERROR;
+    } else if (strcmp(level, "WARNING") == 0) {
+        priority = ANDROID_LOG_WARN;
+    } else if (strcmp(level, "INFO") == 0) {
+        priority = ANDROID_LOG_INFO;
+    } else {
+        priority = ANDROID_LOG_DEBUG;
+    }
+    __android_log_vprint(priority, "RNLlama", prefix, args);
+    va_end(args);
+#else
+    printf("[%s] %s:%d ", level, function, line);
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+    printf("\n");
+#endif
 }
-
 static bool rnllama_verbose = false;
 
 #if RNLLAMA_VERBOSE != 1
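With this change a single log() call is routed to logcat (tag "RNLlama", priority derived from the level string) on Android and to stdout everywhere else. An illustrative call site, with a made-up message:

    log("INFO", __func__, __LINE__, "prompt has %d tokens", (int) num_prompt_tokens);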
@@ -187,7 +174,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }
 
 template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end)
 {
     std::string ret;
     for (; begin != end; ++begin)
@@ -214,6 +201,8 @@ struct llama_rn_context
 
     common_params params;
 
+    common_init_result llama_init;
+
     llama_model *model = nullptr;
     float loading_progress = 0;
     bool is_load_interrupted = false;
@@ -230,18 +219,10 @@ struct llama_rn_context
     std::string stopping_word;
     bool incomplete = false;
 
+    std::vector<common_lora_adapter_info> lora;
+
     ~llama_rn_context()
     {
-        if (ctx)
-        {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-        if (model)
-        {
-            llama_free_model(model);
-            model = nullptr;
-        }
         if (ctx_sampling != nullptr)
         {
             common_sampler_free(ctx_sampling);
@@ -280,30 +261,26 @@ struct llama_rn_context
     bool loadModel(common_params &params_)
     {
         params = params_;
-        common_init_result result = common_init_from_params(params);
-        model = result.model;
-        ctx = result.context;
+        llama_init = common_init_from_params(params);
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();
         if (model == nullptr)
         {
            LOG_ERROR("unable to load model: %s", params_.model.c_str());
            return false;
         }
-        LOG_VERBOSE("getting n_ctx");
        n_ctx = llama_n_ctx(ctx);
+
+        // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
+        // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
+
        return true;
     }
 
     bool validateModelChatTemplate() const {
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return res > 0;
+        llama_chat_message chat[] = {{"user", "test"}};
+        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
     }
 
     void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
@@ -330,7 +307,7 @@ struct llama_rn_context
 
     void loadPrompt()
     {
-        std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> prompt_tokens = ::common_tokenize(model, params.prompt, true, true);
         num_prompt_tokens = prompt_tokens.size();
 
         // LOG tokens
@@ -439,6 +416,7 @@ struct llama_rn_context
         }
         if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
         {
+
             LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                 n_eval,
                 n_past,
@@ -477,7 +455,7 @@ struct llama_rn_context
         const int32_t n_probs = params.sampling.n_probs;
 
         // deprecated
-        /*if (params.sparams.temp <= 0 && n_probs > 0)
+        /*if (params.sampling.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
             llama_sampler_init_softmax();
@@ -647,7 +625,11 @@ struct llama_rn_context
         double tg_std = 0;
 
         // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
-        llama_batch batch = llama_batch_init(512, 0, 1);
+        llama_batch batch = llama_batch_init(
+            std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch
+            0, // No embeddings
+            1 // Single sequence
+        );
 
         for (int i = 0; i < nr; i++)
         {
@@ -734,7 +716,27 @@ struct llama_rn_context
             std::string("]");
     }
 
-
+    int applyLoraAdapters(std::vector<common_lora_adapter_info> lora) {
+        for (auto &la : lora) {
+            la.ptr = llama_lora_adapter_init(model, la.path.c_str());
+            if (la.ptr == nullptr) {
+                LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
+                return -1;
+            }
+        }
+        this->lora = lora;
+        common_lora_adapters_apply(ctx, lora);
+        return 0;
+    }
+
+    void removeLoraAdapters() {
+        this->lora.clear();
+        common_lora_adapters_apply(ctx, this->lora); // apply empty list
+    }
+
+    std::vector<common_lora_adapter_info> getLoadedLoraAdapters() {
+        return this->lora;
+    }
     // Context Shifting from KoboldCpp <https://github.com/LostRuins/koboldcpp>
     // Implementation obtained with special permission from @concedo
 
@@ -897,6 +899,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
     }
 
     // End Context Shifting
+
 };
 
 }
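The applyLoraAdapters / removeLoraAdapters / getLoadedLoraAdapters members added to llama_rn_context above presumably back the new native LoRA methods added in RNLlama.java and RNLlama.mm (see the file list). A rough C++-side usage sketch (the adapter path is made up, and the scale field is an assumption about common_lora_adapter_info, which this diff only shows via its path and ptr members):

    // sketch: hot-swapping a LoRA adapter on an already-loaded context
    rnllama::llama_rn_context rn_ctx;
    // ... assume rn_ctx.loadModel(params) succeeded ...

    std::vector<common_lora_adapter_info> adapters(1);
    adapters[0].path  = "/data/local/tmp/my-adapter.gguf"; // illustrative path
    adapters[0].scale = 1.0f;                              // assumed field: adapter scaling

    if (rn_ctx.applyLoraAdapters(adapters) != 0) {
        // llama_lora_adapter_init failed for at least one entry
    }
    auto loaded = rn_ctx.getLoadedLoraAdapters(); // adapters currently applied
    rn_ctx.removeLoraAdapters();                  // applies an empty list, clearing them

Note that model and context teardown now happens through the smart pointers held in llama_init (see the destructor change in the loadModel hunks), so the explicit llama_free / llama_free_model calls were removed from ~llama_rn_context().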