cui-llama.rn 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -5,10 +5,7 @@
  #include <iostream>
  #include "common.h"
  #include "llama.h"
-
- #include <android/log.h>
- #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
- #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
+ #include "sampling.h"

  namespace rnllama {

@@ -27,6 +24,7 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
  batch->n_tokens += 1;
  }

+
  // NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp

  static void log(const char *level, const char *function, int line,
@@ -308,9 +306,9 @@ struct llama_rn_context
  }
  // compare the evaluated prompt with the new prompt
  n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
- LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
- LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
- LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
+ LOG_INFO("%s: n_past: %zu", __func__, n_past);
+ LOG_INFO("%s: embd size: %zu", __func__, embd.size());
+ LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
  embd = prompt_tokens;
  if (n_past == num_prompt_tokens)
  {
@@ -334,7 +332,7 @@ struct llama_rn_context
  {
  // number of tokens to keep when resetting context
  n_remain = params.n_predict;
- llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_context_reset(ctx);
  is_predicting = true;
  }

@@ -391,7 +389,7 @@ struct llama_rn_context
  n_past += n_eval;

  if(is_interrupted) {
- LOG("Decoding Interrupted");
+ LOG_INFO("Decoding Interrupted");
  embd.resize(n_past);
  has_next_token = false;
  return result;
@@ -797,7 +795,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context

  if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
  {
- LLAMA_LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
+ LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
  return; //no purge is needed
  }

@@ -825,7 +823,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
  current_context_tokens[i - diff] = current_context_tokens[i];
  }

- LLAMA_LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
+ LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);

  current_context_tokens.resize(current_context_tokens.size() - diff);
  }
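Note: these hunks drop the Android-only LLAMA_LOG_INFO macro (built on __android_log_print) in favor of the shared LOG_INFO macro, and follow the upstream llama.cpp rename of llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT) to llama_perf_context_reset(ctx). As a rough sketch only, a platform-neutral logging macro could be wired up like the following; the EXAMPLE_* names are hypothetical and this is not the macro actually provided by common.h:

    // Sketch: route logging to logcat on Android, stderr elsewhere.
    // EXAMPLE_LOG_TAG and EXAMPLE_LOG_INFO are illustrative names only.
    #include <cstdio>
    #if defined(__ANDROID__)
    #include <android/log.h>
    #define EXAMPLE_LOG_TAG "RNLLAMA_EXAMPLE"
    #define EXAMPLE_LOG_INFO(...) \
        __android_log_print(ANDROID_LOG_INFO, EXAMPLE_LOG_TAG, __VA_ARGS__)
    #else
    #define EXAMPLE_LOG_INFO(...) \
        do { fprintf(stderr, __VA_ARGS__); fputc('\n', stderr); } while (0)
    #endif
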
package/cpp/sampling.cpp CHANGED
@@ -2,6 +2,9 @@

  #include "common.h"

+ #include <cmath>
+ #include <unordered_map>
+
  // the ring buffer works similarly to std::deque, but with a fixed capacity
  // TODO: deduplicate with llama-impl.h
  template<typename T>
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
  struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
  llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

- lparams.no_perf = false; // TODO: control via params
+ lparams.no_perf = params.no_perf;

  auto * result = new gpt_sampler {
  /* .params = */ params,
@@ -257,10 +260,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
  // TODO: measure grammar performance

  if (gsmpl) {
- llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+ llama_perf_sampler_print(gsmpl->chain);
  }
  if (ctx) {
- llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+ llama_perf_context_print(ctx);
  }
  }

@@ -310,6 +313,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
  return cur_p.data[cur_p.selected].id;
  }

+ uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+ return llama_sampler_get_seed(gsmpl->chain);
+ }
+
  // helpers

  llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
@@ -321,7 +328,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
  }

  std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
- std::string result = "\tlogits ";
+ std::string result = "logits ";

  for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
  const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
@@ -352,10 +359,6 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
  return result;
  }

- struct llama_sampler_timings gpt_sampler_get_timigs(const struct gpt_sampler * gsmpl){
- return llama_sampler_chain_timings(gsmpl -> chain);
- }
-
  char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
  switch (cnstr) {
  case GPT_SAMPLER_TYPE_TOP_K: return 'k';
@@ -432,7 +435,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
  }

  std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
- std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+ std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
  { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
  { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
  { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
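Note: the new gpt_sampler_get_seed accessor wraps llama_sampler_get_seed on the sampler chain, which lets callers read back the seed actually in use, for example when the request passed LLAMA_DEFAULT_SEED and the effective (typically randomly chosen) seed needs to be reported. A minimal usage sketch, assuming `smpl` was obtained from gpt_sampler_init as in the sources above:

    // Sketch: report the effective sampling seed after initialization.
    #include <cstdint>
    #include <cstdio>

    static void report_seed(const struct gpt_sampler * smpl) {
        const uint32_t seed = gpt_sampler_get_seed(smpl); // wraps llama_sampler_get_seed(chain)
        printf("sampling seed in use: %u\n", seed);
    }
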
package/cpp/sampling.h CHANGED
@@ -2,65 +2,11 @@

  #include "llama.h"

+ #include "common.h"
+
  #include <string>
  #include <vector>

- enum gpt_sampler_type {
- GPT_SAMPLER_TYPE_NONE = 0,
- GPT_SAMPLER_TYPE_TOP_K = 1,
- GPT_SAMPLER_TYPE_TOP_P = 2,
- GPT_SAMPLER_TYPE_MIN_P = 3,
- GPT_SAMPLER_TYPE_TFS_Z = 4,
- GPT_SAMPLER_TYPE_TYPICAL_P = 5,
- GPT_SAMPLER_TYPE_TEMPERATURE = 6,
- GPT_SAMPLER_TYPE_XTC = 7,
- };
-
- // sampling parameters
- struct gpt_sampler_params {
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
- int32_t n_prev = 64; // number of previous tokens to remember
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
- int32_t top_k = 40; // <= 0 to use vocab size
- float top_p = 0.95f; // 1.0 = disabled
- float min_p = 0.05f; // 0.0 = disabled
- float tfs_z = 1.00f; // 1.0 = disabled
- float xtc_t = 0.0f; // 0.0 = disabled
- float xtc_p = 0.0f;
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
- float dynatemp_range = 0.00f; // 0.0 = disabled
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat = 1.00f; // 1.0 = disabled
- float penalty_freq = 0.00f; // 0.0 = disabled
- float penalty_present = 0.00f; // 0.0 = disabled
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
- float mirostat_tau = 5.00f; // target entropy
- float mirostat_eta = 0.10f; // learning rate
- bool penalize_nl = false; // consider newlines as a repeatable token
- bool ignore_eos = false;
-
- std::vector<enum gpt_sampler_type> samplers = {
- GPT_SAMPLER_TYPE_TOP_K,
- GPT_SAMPLER_TYPE_TFS_Z,
- GPT_SAMPLER_TYPE_TYPICAL_P,
- GPT_SAMPLER_TYPE_TOP_P,
- GPT_SAMPLER_TYPE_MIN_P,
- GPT_SAMPLER_TYPE_XTC,
- GPT_SAMPLER_TYPE_TEMPERATURE
- };
-
- std::string grammar; // optional BNF-like grammar to constrain sampling
-
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
- // print the parameters into a string
- std::string print() const;
- };
-
  // gpt_sampler extends llama_sampler with additional functionality:
  //
  // - grammar support
@@ -114,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
  //
  llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

+ uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
  // helpers

  // access the internal list of current candidate tokens
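Note: the gpt_sampler_type enum and the gpt_sampler_params struct are removed from sampling.h, and the header now includes common.h, which presumably carries those declarations instead. Callers still build the parameter struct the same way, and gpt_sampler_init now also honours its no_perf field (see the sampling.cpp hunk above). A hedged usage sketch; `model` is a placeholder for a loaded llama_model pointer, and the field names are taken from the struct shown removed above plus the no_perf field used by gpt_sampler_init:

    // Sketch: configure sampler params and build a sampler chain.
    gpt_sampler_params sparams;
    sparams.seed    = LLAMA_DEFAULT_SEED; // let the sampler pick a seed
    sparams.top_k   = 40;
    sparams.top_p   = 0.95f;
    sparams.temp    = 0.80f;
    sparams.no_perf = false;              // keep sampler perf timings enabled

    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);
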
package/cpp/sgemm.cpp CHANGED
@@ -235,6 +235,14 @@ template <> inline __m512 load(const lm_ggml_fp16_t *p) {
  }
  #endif // __AVX512F__

+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // CONSTANTS
+
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
+ #endif
+
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // FLOATING POINT MATRIX MULTIPLICATION

@@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX {
  return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
  }

+ inline __m256i load(const block_iq4_nl *b) {
+ return MM256_SET_M128I(load1(b), load0(b));
+ }
+
+ inline __m128i load0(const block_iq4_nl *b) {
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+ return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
+ }
+
+ inline __m128i load1(const block_iq4_nl *b) {
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
+ return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
+ }
+
  inline __m256 updot(__m256i u, __m256i s) {
  __m256i res;
  #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
  #endif
  }

+ case LM_GGML_TYPE_IQ4_NL: {
+ if (Btype != LM_GGML_TYPE_Q8_0)
+ return false;
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+ tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
+ k, (const block_iq4_nl *)A, lda,
+ (const block_q8_0 *)B, ldb,
+ (float *)C, ldc,
+ ith, nth};
+ tb.matmul(m, n);
+ return true;
+ #else
+ return false;
+ #endif
+ }
+
  default:
  return false;
  }
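Note: these hunks add an IQ4_NL x Q8_0 path to llamafile_sgemm on AVX-class hardware. kvalues_iq4nl is the 16-entry non-linear codebook for the IQ4_NL 4-bit format, and the new load0/load1 helpers use _mm_shuffle_epi8 as a 16-way table lookup to expand the low and high nibbles of each block's 16 packed bytes into signed 8-bit codebook values. A scalar sketch of the same nibble-to-codebook expansion, for reference; the block layout is simplified here (the real block_iq4_nl also carries a per-block scale that is applied elsewhere):

    // Scalar sketch of the expansion performed by the AVX load0/load1 helpers.
    // `qs` packs 32 4-bit indices into 16 bytes; `out` receives 32 unscaled values.
    #include <cstdint>

    static const int8_t kvalues_iq4nl_ref[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

    static void iq4nl_expand_ref(const uint8_t qs[16], int8_t out[32]) {
        for (int i = 0; i < 16; ++i) {
            out[i]      = kvalues_iq4nl_ref[qs[i] & 0x0F]; // low nibbles  (load0)
            out[i + 16] = kvalues_iq4nl_ref[qs[i] >> 4];   // high nibbles (load1)
        }
    }
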
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "cui-llama.rn",
- "version": "1.1.4",
+ "version": "1.1.6",
  "description": "Fork of llama.rn for ChatterUI",
  "main": "lib/commonjs/index",
  "module": "lib/module/index",