cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -7,6 +7,7 @@
 #include "llama.h"
 
 #include <android/log.h>
+#include "sampling.h"
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
@@ -163,8 +164,8 @@ struct llama_rn_context
 
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
-    llama_sampling_context *ctx_sampling = nullptr;
-
+    gpt_sampler *ctx_sampling = nullptr;
+
     int n_ctx;
 
     bool truncated = false;
@@ -188,7 +189,7 @@ struct llama_rn_context
         }
         if (ctx_sampling != nullptr)
         {
-            llama_sampling_free(ctx_sampling);
+            gpt_sampler_free(ctx_sampling);
         }
     }
 
@@ -215,9 +216,9 @@ struct llama_rn_context
 
     bool initSampling() {
         if (ctx_sampling != nullptr) {
-            llama_sampling_free(ctx_sampling);
+            gpt_sampler_free(ctx_sampling);
         }
-        ctx_sampling = llama_sampling_init(params.sparams);
+        ctx_sampling = gpt_sampler_init(model, params.sparams);
         return ctx_sampling != nullptr;
     }
 
@@ -304,7 +305,7 @@ struct llama_rn_context
         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
         {
-            llama_sampling_accept(ctx_sampling, ctx, token, false);
+            gpt_sampler_accept(ctx_sampling, token, false);
         }
         // compare the evaluated prompt with the new prompt
         n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -334,8 +335,7 @@ struct llama_rn_context
     {
         // number of tokens to keep when resetting context
         n_remain = params.n_predict;
-        llama_set_rng_seed(ctx, params.seed);
-
+        llama_perf_context_reset(ctx);
         is_predicting = true;
     }
 
@@ -383,7 +383,7 @@ struct llama_rn_context
             LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                 n_eval,
                 n_past,
-                params.n_threads,
+                params.cpuparams.n_threads,
                 tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
             );
             has_next_token = false;
@@ -411,22 +411,26 @@ struct llama_rn_context
         std::vector<llama_token_data> candidates;
         candidates.reserve(llama_n_vocab(model));
 
-        result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
-
-        llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
+        result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+
+        llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
 
         const int32_t n_probs = params.sparams.n_probs;
+
+
         if (params.sparams.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
-            llama_sample_softmax(ctx, &cur_p);
+            llama_sampler_init_softmax();
         }
+
 
         for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
         {
             result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
         }
-        llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+
+        gpt_sampler_accept(ctx_sampling, result.tok, true);
         if (tg) {
             num_tokens_predicted++;
         }
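
The last hunk also changes how per-token probabilities are gathered: instead of building a llama_token_data_array by hand from ctx_sampling->cur, the candidate list is read back from the sampler via gpt_sampler_get_candidates. A hedged sketch of that pattern is below; the returned probs container stands in for result.probs in the hunk, and only functions visible in this diff are used.

    // Illustrative sketch: collecting the top n_probs candidates after sampling,
    // mirroring the hunk above.
    #include <algorithm>
    #include <utility>
    #include <vector>
    #include "sampling.h"

    static std::vector<std::pair<llama_token, float>>
    sample_with_probs(gpt_sampler * smpl, llama_context * ctx, int32_t n_probs) {
        const llama_token tok = gpt_sampler_sample(smpl, ctx, -1);

        // The sampler now owns the candidate list; no manual construction
        // from ctx_sampling->cur as in the removed lines.
        llama_token_data_array cur_p = *gpt_sampler_get_candidates(smpl);

        std::vector<std::pair<llama_token, float>> probs;
        for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
            probs.push_back({cur_p.data[i].id, cur_p.data[i].p});   // token id and its probability
        }

        gpt_sampler_accept(smpl, tok, /* accept_grammar = */ true); // update sampler state with the chosen token
        return probs;
    }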