cui-llama.rn 1.1.5 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +1 -4
- package/cpp/common.cpp +157 -53
- package/cpp/common.h +11 -3
- package/cpp/ggml-metal.m +33 -22
- package/cpp/ggml-quants.c +33 -36
- package/cpp/ggml.h +5 -4
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +0 -8
- package/cpp/llama.cpp +519 -34
- package/cpp/llama.h +0 -17
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +7 -10
- package/cpp/sampling.cpp +1 -5
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/rn-llama.hpp
CHANGED
@@ -5,11 +5,7 @@
 #include <iostream>
 #include "common.h"
 #include "llama.h"
-
-#include <android/log.h>
 #include "sampling.h"
-#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
-#define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)

 namespace rnllama {

@@ -28,6 +24,7 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
     batch->n_tokens += 1;
 }

+
 // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp

 static void log(const char *level, const char *function, int line,
@@ -309,9 +306,9 @@ struct llama_rn_context
     }
     // compare the evaluated prompt with the new prompt
     n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
-
-
-
+    LOG_INFO("%s: n_past: %zu", __func__, n_past);
+    LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+    LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
     embd = prompt_tokens;
     if (n_past == num_prompt_tokens)
     {
@@ -392,7 +389,7 @@ struct llama_rn_context
     n_past += n_eval;

     if(is_interrupted) {
-
+        LOG_INFO("Decoding Interrupted");
         embd.resize(n_past);
         has_next_token = false;
         return result;
@@ -798,7 +795,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context

     if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
     {
-
+        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
         return; //no purge is needed
     }

@@ -826,7 +823,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
         current_context_tokens[i - diff] = current_context_tokens[i];
     }

-
+    LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);

     current_context_tokens.resize(current_context_tokens.size() - diff);
 }
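Net effect of the rn-llama.hpp hunks: the Android-only __android_log_print macro is dropped and logging goes through the shared LOG_INFO facility instead (note log.cpp +401 and the rewritten log.h in the file list above). For readers unfamiliar with the pattern, here is a minimal sketch of the printf-style helper behind such macros; the signatures below are illustrative assumptions, not the package's actual log.h:

// Hypothetical sketch only -- the real implementation lives in package/cpp/log.cpp.
#include <cstdarg>
#include <cstdio>

static void log(const char *level, const char *function, int line,
                const char *format, ...) {
    va_list args;
    va_start(args, format);
    fprintf(stderr, "[%s] %s:%d ", level, function, line); // metadata prefix
    vfprintf(stderr, format, args);                        // caller's printf-style payload
    fputc('\n', stderr);
    va_end(args);
}

#define LOG_INFO(...) log("INFO", __func__, __LINE__, __VA_ARGS__)

With such a definition, a call like LOG_INFO("%s: n_past: %zu", __func__, n_past) from the hunk above expands to one line on the log stream, with no Android NDK dependency.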
package/cpp/sampling.cpp
CHANGED
@@ -328,7 +328,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
 }

 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "
+    std::string result = "logits ";

     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
@@ -359,10 +359,6 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
     return result;
 }

-struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl){
-    return llama_sampler_chain_timings(gsmpl -> chain);
-}
-
 char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
     switch (cnstr) {
         case GPT_SAMPLER_TYPE_TOP_K: return 'k';
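This removal lines up with the llama.h (-17) and llama-sampling.cpp (-8) entries in the file list: the misspelled gpt_sampler_get_timigs wrapper disappears along with, presumably, the llama_sampler_timings struct and llama_sampler_chain_timings helper it wrapped.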
package/cpp/sgemm.cpp
CHANGED
@@ -235,6 +235,14 @@ template <> inline __m512 load(const lm_ggml_fp16_t *p) {
 }
 #endif // __AVX512F__

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CONSTANTS
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION

@@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX {
         return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
     }

+    inline __m256i load(const block_iq4_nl *b) {
+        return MM256_SET_M128I(load1(b), load0(b));
+    }
+
+    inline __m128i load0(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+    }
+
+    inline __m128i load1(const block_iq4_nl *b) {
+        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+    }
+
     inline __m256 updot(__m256i u, __m256i s) {
         __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
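These load overloads are the whole trick for IQ4_NL inside tinyBLAS: each byte of b->qs packs two 4-bit indices, and _mm_shuffle_epi8 against the preloaded iq4nlt table maps sixteen indices to their nonlinear int8 values in a single instruction. A scalar sketch of the same computation, assuming the upstream ggml layout where block_iq4_nl stores 32 indices in qs[16] alongside an fp16 scale:

#include <cstdint>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10,
       1,   13,  25,  38,  53,  69,  89, 113,
};

// out[0..15] mirrors load0 (low nibbles), out[16..31] mirrors load1 (high nibbles).
void iq4nl_lookup(const uint8_t qs[16], int8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = kvalues_iq4nl[qs[i] & 0x0F];
        out[i + 16] = kvalues_iq4nl[qs[i] >> 4];
    }
}

The extra mask in load1 (_mm_and_si128 after _mm_srli_epi16) exists because the SSE shift operates on 16-bit lanes and would otherwise smear bits across byte boundaries; the scalar >> 4 on a uint8_t needs no mask.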
@@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }

+    case LM_GGML_TYPE_IQ4_NL: {
+        if (Btype != LM_GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+            k, (const block_iq4_nl *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
     default:
         return false;
     }
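The new case extends llamafile_sgemm's A-type dispatch so IQ4_NL weight blocks reuse the existing tinyBLAS_Q0_AVX kernel whenever the activations are Q8_0; on non-AVX builds, or for any other B type, it returns false and the caller falls back to ggml's generic vec_dot path. A hypothetical call sketch follows; the tail of the parameter list (the task tag and three type tags) is assumed from the upstream sgemm.h declaration, not quoted from this package, and sgemm.h/ggml.h are assumed included:

// Hypothetical wrapper showing how the new case would be reached from
// ggml's matmul op, running as thread ith of nth.
bool try_fast_iq4nl_mm(int64_t m, int64_t n, int64_t k,
                       const block_iq4_nl *A, int64_t lda,
                       const block_q8_0 *B, int64_t ldb,
                       float *C, int64_t ldc,
                       int ith, int nth) {
    return llamafile_sgemm(m, n, k,
                           A, lda,   // IQ4_NL weights: hits the new case
                           B, ldb,   // Q8_0 activations: required, else false
                           C, ldc,   // fp32 output
                           ith, nth, // thread index / thread count
                           LM_GGML_TASK_TYPE_COMPUTE, // assumed task tag
                           LM_GGML_TYPE_IQ4_NL,
                           LM_GGML_TYPE_Q8_0,
                           LM_GGML_TYPE_F32);
    // A false return means: use the generic quantized dot-product instead.
}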