cui-llama.rn 1.0.3 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +35 -39
  2. package/android/src/main/CMakeLists.txt +12 -2
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +29 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +33 -1
  5. package/android/src/main/jni.cpp +62 -8
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
  8. package/cpp/common.cpp +3237 -3231
  9. package/cpp/common.h +469 -468
  10. package/cpp/ggml-aarch64.c +2193 -2193
  11. package/cpp/ggml-aarch64.h +39 -39
  12. package/cpp/ggml-alloc.c +1036 -1042
  13. package/cpp/ggml-backend-impl.h +153 -153
  14. package/cpp/ggml-backend.c +2240 -2234
  15. package/cpp/ggml-backend.h +238 -238
  16. package/cpp/ggml-common.h +1833 -1829
  17. package/cpp/ggml-impl.h +755 -655
  18. package/cpp/ggml-metal.h +65 -65
  19. package/cpp/ggml-metal.m +3269 -3269
  20. package/cpp/ggml-quants.c +14872 -14860
  21. package/cpp/ggml-quants.h +132 -132
  22. package/cpp/ggml.c +22055 -22044
  23. package/cpp/ggml.h +2453 -2447
  24. package/cpp/llama-grammar.cpp +539 -0
  25. package/cpp/llama-grammar.h +39 -0
  26. package/cpp/llama-impl.h +26 -0
  27. package/cpp/llama-sampling.cpp +635 -0
  28. package/cpp/llama-sampling.h +56 -0
  29. package/cpp/llama-vocab.cpp +1721 -0
  30. package/cpp/llama-vocab.h +130 -0
  31. package/cpp/llama.cpp +19171 -21892
  32. package/cpp/llama.h +1240 -1217
  33. package/cpp/log.h +737 -737
  34. package/cpp/rn-llama.hpp +207 -29
  35. package/cpp/sampling.cpp +460 -460
  36. package/cpp/sgemm.cpp +1027 -1027
  37. package/cpp/sgemm.h +14 -14
  38. package/cpp/unicode.cpp +6 -0
  39. package/cpp/unicode.h +3 -0
  40. package/ios/RNLlama.mm +15 -6
  41. package/ios/RNLlamaContext.h +2 -8
  42. package/ios/RNLlamaContext.mm +41 -34
  43. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  44. package/lib/commonjs/chat.js +37 -0
  45. package/lib/commonjs/chat.js.map +1 -0
  46. package/lib/commonjs/index.js +14 -1
  47. package/lib/commonjs/index.js.map +1 -1
  48. package/lib/module/NativeRNLlama.js.map +1 -1
  49. package/lib/module/chat.js +31 -0
  50. package/lib/module/chat.js.map +1 -0
  51. package/lib/module/index.js +14 -1
  52. package/lib/module/index.js.map +1 -1
  53. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  54. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  55. package/lib/typescript/chat.d.ts +10 -0
  56. package/lib/typescript/chat.d.ts.map +1 -0
  57. package/lib/typescript/index.d.ts +9 -2
  58. package/lib/typescript/index.d.ts.map +1 -1
  59. package/package.json +1 -1
  60. package/src/NativeRNLlama.ts +10 -1
  61. package/src/chat.ts +44 -0
  62. package/src/index.ts +31 -4
package/cpp/rn-llama.hpp CHANGED
@@ -6,6 +6,13 @@
 #include "common.h"
 #include "llama.h"
 
+
+#include <android/log.h>
+#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
+#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
+
+
+
 namespace rnllama {
 
 static void llama_batch_clear(llama_batch *batch) {
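
Note on this hunk: LLAMA_LOG_INFO now routes rn-llama diagnostics to Android logcat under the RNLLAMA_LOG_ANDROID tag (viewable with `adb logcat -s RNLLAMA_LOG_ANDROID`). Below is a minimal sketch of how the same macro could be guarded per platform; the `#if defined(__ANDROID__)` fallback is our assumption and is not part of this release:

```cpp
// Hypothetical platform guard (not in the released header): log to logcat on
// Android and fall back to stderr everywhere else.
#if defined(__ANDROID__)
#include <android/log.h>
#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
#define LLAMA_LOG_INFO(...) \
    __android_log_print(ANDROID_LOG_INFO, LLAMA_ANDROID_TAG, __VA_ARGS__)
#else
#include <cstdio>
#define LLAMA_LOG_INFO(...) fprintf(stderr, __VA_ARGS__)
#endif
```
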
@@ -139,6 +146,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     return ret;
 }
 
+
 struct llama_rn_context
 {
     bool is_predicting = false;
@@ -167,7 +175,7 @@ struct llama_rn_context
     bool stopped_word = false;
     bool stopped_limit = false;
     std::string stopping_word;
-    int32_t multibyte_pending = 0;
+    bool incomplete = false;
 
     ~llama_rn_context()
     {
@@ -202,7 +210,7 @@ struct llama_rn_context
         stopped_word = false;
         stopped_limit = false;
         stopping_word = "";
-        multibyte_pending = 0;
+        incomplete = false;
         n_remain = 0;
         n_past = 0;
         params.sparams.n_prev = n_ctx;
@@ -229,6 +237,14 @@ struct llama_rn_context
         return true;
     }
 
+    bool validateModelChatTemplate() const {
+        llama_chat_message chat[] = {{"user", "test"}};
+
+        const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+
+        return res > 0;
+    }
+
     void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
         const int n_left = n_ctx - params.n_keep;
         const int n_block_size = n_left / 2;
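
Note on this hunk: validateModelChatTemplate() probes whether the loaded model ships a usable built-in chat template by calling llama_chat_apply_template with a null output buffer and treating a positive required length as success. A sketch of the same API doing actual formatting follows; the `messages` contents and the grow-and-retry pattern are illustrative, not taken from the package:

```cpp
// Illustrative sketch: render a conversation with the model's built-in
// template. `model` is an already-loaded llama_model *.
std::vector<llama_chat_message> messages = {
    {"system", "You are a helpful assistant."},
    {"user",   "Hello!"},
};
std::vector<char> buf(1024);
int32_t len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                        /*add_ass=*/true, buf.data(), buf.size());
if (len > (int32_t) buf.size()) {   // buffer too small: grow to the reported size and retry
    buf.resize(len);
    len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                    true, buf.data(), buf.size());
}
if (len > 0) {
    std::string prompt(buf.data(), len);   // ready to tokenize and evaluate
}
```
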
@@ -278,15 +294,20 @@ struct llama_rn_context
 
             LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
         }
+
+        // do Context Shift , may be buggy! TODO: Verify functionality
+        purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+
         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
        {
            llama_sampling_accept(ctx_sampling, ctx, token, false);
        }
-
        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);
-
+        LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
+        LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+        LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
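
Note on this hunk: the new log lines report how much of the previous context can be reused. common_part (defined earlier in rn-llama.hpp) returns the length of the shared token prefix, so only the prompt suffix after n_past is evaluated again, and purge_missing_tokens (added at the end of this file) may additionally cut a mismatched middle section out of the KV cache first. A sketch of that prefix count, under the assumption that common_part behaves as described (the function name below is ours):

```cpp
#include <vector>
#include "llama.h"

// Sketch of the prefix-reuse count logged above: number of leading tokens the
// cached context and the new prompt have in common.
static size_t common_prefix_len(const std::vector<llama_token> & a,
                                const std::vector<llama_token> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}
```
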
@@ -470,35 +491,28 @@ struct llama_rn_context
             generated_token_probs.push_back(token_with_probs);
         }
 
-        if (multibyte_pending > 0)
-        {
-            multibyte_pending -= token_text.size();
-        }
-        else if (token_text.size() == 1)
-        {
-            const char c = token_text[0];
-            // 2-byte characters: 110xxxxx 10xxxxxx
-            if ((c & 0xE0) == 0xC0)
-            {
-                multibyte_pending = 1;
-                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
-            }
-            else if ((c & 0xF0) == 0xE0)
-            {
-                multibyte_pending = 2;
-                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        // check if there is incomplete UTF-8 character at the end
+        for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) {
+            unsigned char c = generated_text[generated_text.size() - i];
+            if ((c & 0xC0) == 0x80) {
+                // continuation byte: 10xxxxxx
+                continue;
             }
-            else if ((c & 0xF8) == 0xF0)
-            {
-                multibyte_pending = 3;
-            }
-            else
-            {
-                multibyte_pending = 0;
+            if ((c & 0xE0) == 0xC0) {
+                // 2-byte character: 110xxxxx ...
+                incomplete = i < 2;
+            } else if ((c & 0xF0) == 0xE0) {
+                // 3-byte character: 1110xxxx ...
+                incomplete = i < 3;
+            } else if ((c & 0xF8) == 0xF0) {
+                // 4-byte character: 11110xxx ...
+                incomplete = i < 4;
             }
+            // else 1-byte character or invalid byte
+            break;
         }
 
-        if (multibyte_pending > 0 && !has_next_token)
+        if (incomplete && !has_next_token)
         {
             has_next_token = true;
             n_remain++;
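
Note on this hunk: the old multibyte_pending counter only tracked pending continuation bytes when a token emitted exactly one byte; the replacement scans the tail of generated_text directly, which also handles tokens that emit several bytes at once. The same check as a free function, for reference (the function name is ours, not the package's):

```cpp
#include <string>

// Reference sketch of the new check: true when `text` ends in a truncated
// multi-byte UTF-8 sequence, so the streamed chunk should be held back.
static bool ends_with_incomplete_utf8(const std::string & text) {
    for (unsigned i = 1; i < 5 && i <= text.size(); ++i) {
        unsigned char c = text[text.size() - i];
        if ((c & 0xC0) == 0x80) continue;      // continuation byte 10xxxxxx, keep scanning
        if ((c & 0xE0) == 0xC0) return i < 2;  // lead byte of a 2-byte sequence
        if ((c & 0xF0) == 0xE0) return i < 3;  // lead byte of a 3-byte sequence
        if ((c & 0xF8) == 0xF0) return i < 4;  // lead byte of a 4-byte sequence
        return false;                          // ASCII or invalid lead byte
    }
    return false;
}
```
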
@@ -638,6 +652,170 @@ struct llama_rn_context
             std::to_string(tg_std) +
             std::string("]");
     }
+
+
+    // Context Shifting from KoboldCpp <https://github.com/LostRuins/koboldcpp>
+    // Implementation obtained with special permission from @concedo
+
+    std::vector<int> longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
+        int m = x.size(), n = y.size();
+
+        //int LCSuff[m+1][n+1];
+        std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
+
+        for (int j = 0; j <= n; j++)
+            LCSuff[0][j] = 0;
+        for (int i = 0; i <= m; i++)
+            LCSuff[i][0] = 0;
+
+        for (int i = 1; i <= m; i++)
+        {
+            for (int j = 1; j <= n; j++)
+            {
+                if (x[i - 1] == y[j - 1])
+                    LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
+                else
+                    LCSuff[i][j] = 0;
+            }
+        }
+
+        std::vector<int> longest;
+        for (int i = 1; i <= m; i++)
+        {
+            for (int j = 1; j <= n; j++)
+            {
+                if (LCSuff[i][j] > longest.size())
+                {
+                    auto off1 = ((i - LCSuff[i][j] + 1) - 1);
+                    auto off2 = off1 + LCSuff[i][j];
+                    longest.clear();
+                    // std::vector<int>().swap(longest);
+                    longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
+                    // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
+                }
+            }
+        }
+        return longest;
+    }
+
+    bool arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
+    {
+        int ss = searchSeq.size();
+        if(targetArray.size()<ss)
+        {
+            return false;
+        }
+        for(int i=0;i<ss;++i)
+        {
+            if(targetArray[i]!=searchSeq[i])
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    int arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
+    {
+        int ss = searchSeq.size();
+        int tas = targetArray.size();
+        if(tas<ss)
+        {
+            return -1;
+        }
+        for(int i=0;i<tas;++i)
+        {
+            int srch = 0;
+            bool fail = false;
+            for(int srch=0;srch<ss;++srch)
+            {
+                if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
+                {
+                    fail = true;
+                    break;
+                }
+            }
+            if(!fail)
+            {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
+    {
+        //scan from start old and new ctx, until first mismatch found, save as p0
+        //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
+        //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
+        //if passed, save beginning of LCQ from old ctx as p1
+        //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
+
+        const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
+        const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
+
+        int trimstart = 0;
+        int new_tokens_len = new_context_tokens.size();
+        bool purge_needed = true;
+
+        for (int i = 0; i < current_context_tokens.size(); ++i)
+        {
+            if (current_context_tokens[i] == new_context_tokens[i])
+            {
+                trimstart += 1;
+            }
+            else
+            {
+                break;
+            }
+            if ((i + 2) >= new_tokens_len)
+            {
+                purge_needed = false;
+                break; //no surgery required
+            }
+        }
+
+
+
+        if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
+        {
+            LLAMA_LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
+            return; //no purge is needed
+        }
+
+        //at least this many tokens need to match, otherwise don't bother trimming
+        const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
+
+        auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
+        auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
+
+        auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
+
+        if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
+        {
+            int found = arr_find_index_of(current_context_tokens,shared);
+            if(found>=0 && found > trimstart)
+            {
+
+                //extract the unwanted tokens out from context and KV
+                int diff = found - trimstart;
+                llama_kv_cache_seq_rm(ctx, 0, trimstart, trimstart + diff);
+                llama_kv_cache_seq_add(ctx, 0, trimstart + diff, -1, -diff);
+
+                for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
+                {
+                    current_context_tokens[i - diff] = current_context_tokens[i];
+                }
+
+                LLAMA_LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+
+                current_context_tokens.resize(current_context_tokens.size() - diff);
+            }
+        }
+
+    }
+
+    // End Context Shifting
 };
 
 }
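
Note on the Context Shifting block: purge_missing_tokens scans for the first mismatch between the cached tokens and the new prompt (trimstart), finds the longest token run the two still share, and, if that run is long enough and starts the remainder of the new prompt, removes the mismatched stretch from both the token array and the KV cache instead of re-evaluating everything. The KV-cache side of that surgery comes down to two llama.cpp calls; a minimal sketch, with `trimstart` and `diff` assumed to be computed as in the function above:

```cpp
#include "llama.h"

// Sketch of the cache edit done once the purge region is known: drop `diff`
// cells of sequence 0 starting at `trimstart`, then shift the later cells'
// positions down by `diff` so decoding continues with contiguous positions.
static void erase_kv_range(llama_context * ctx, int trimstart, int diff) {
    llama_kv_cache_seq_rm (ctx, 0, trimstart, trimstart + diff);
    llama_kv_cache_seq_add(ctx, 0, trimstart + diff, -1, -diff);
}
```

The token copy loop in the hunk performs the matching shift on current_context_tokens so the host-side array stays in sync with the cache.
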