cui-llama.rn 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -163,8 +163,8 @@ struct llama_rn_context
 
  llama_model *model = nullptr;
  llama_context *ctx = nullptr;
- llama_sampling_context *ctx_sampling = nullptr;
-
+ gpt_sampler *ctx_sampling = nullptr;
+
  int n_ctx;
 
  bool truncated = false;
@@ -188,7 +188,7 @@ struct llama_rn_context
  }
  if (ctx_sampling != nullptr)
  {
- llama_sampling_free(ctx_sampling);
+ gpt_sampler_free(ctx_sampling);
  }
  }
 
@@ -215,9 +215,9 @@ struct llama_rn_context
 
  bool initSampling() {
  if (ctx_sampling != nullptr) {
- llama_sampling_free(ctx_sampling);
+ gpt_sampler_free(ctx_sampling);
  }
- ctx_sampling = llama_sampling_init(params.sparams);
+ ctx_sampling = gpt_sampler_init(model, params.sparams);
  return ctx_sampling != nullptr;
  }
 
@@ -304,7 +304,7 @@ struct llama_rn_context
  // push the prompt into the sampling context (do not apply grammar)
  for (auto & token : prompt_tokens)
  {
- llama_sampling_accept(ctx_sampling, ctx, token, false);
+ gpt_sampler_accept(ctx_sampling, token, false);
  }
  // compare the evaluated prompt with the new prompt
  n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -334,8 +334,7 @@ struct llama_rn_context
  {
  // number of tokens to keep when resetting context
  n_remain = params.n_predict;
- llama_set_rng_seed(ctx, params.seed);
-
+ llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
  is_predicting = true;
  }
 
@@ -383,7 +382,7 @@ struct llama_rn_context
  LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
  n_eval,
  n_past,
- params.n_threads,
+ params.cpuparams.n_threads,
  tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
  );
  has_next_token = false;
@@ -411,22 +410,26 @@ struct llama_rn_context
  std::vector<llama_token_data> candidates;
  candidates.reserve(llama_n_vocab(model));
 
- result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
-
- llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
+ result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+
+ llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
 
  const int32_t n_probs = params.sparams.n_probs;
+
+
  if (params.sparams.temp <= 0 && n_probs > 0)
  {
  // For llama_sample_token_greedy we need to sort candidates
- llama_sample_softmax(ctx, &cur_p);
+ llama_sampler_init_softmax();
  }
+
 
  for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
  {
  result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
  }
- llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+
+ gpt_sampler_accept(ctx_sampling, result.tok, true);
  if (tg) {
  num_tokens_predicted++;
  }
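
For context, the hunks above track llama.cpp's move from the llama_sampling_* helpers to the gpt_sampler API in the bundled common code: init now takes the model, accept no longer needs the context, and sampling reads candidates through gpt_sampler_get_candidates. The fragment below is a minimal sketch of that lifecycle pieced together only from the calls visible in this diff; the sample_next wrapper, the header names, and the pre-loaded model, ctx, and prompt_tokens are illustrative assumptions, not part of the package.

// Minimal sketch of the gpt_sampler lifecycle used in rn-llama.hpp above.
// Assumes `model` and `ctx` were created elsewhere and that the llama.cpp
// common headers ("common.h", "sampling.h") are on the include path.
#include <vector>
#include "common.h"
#include "sampling.h"

static llama_token sample_next(llama_model * model,
                               llama_context * ctx,
                               const std::vector<llama_token> & prompt_tokens,
                               const gpt_params & params) {
    // gpt_sampler_init now takes the model in addition to the sampler params
    // (previously llama_sampling_init(params.sparams)).
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    // Push the prompt without applying grammar, as in loadPrompt() above.
    for (const llama_token tok : prompt_tokens) {
        gpt_sampler_accept(smpl, tok, /*accept_grammar=*/false);
    }

    // Sample from the last set of logits (idx == -1), mirroring nextToken().
    const llama_token id = gpt_sampler_sample(smpl, ctx, /*idx=*/-1);

    // Candidate list, e.g. for n_probs-style probability reporting.
    const llama_token_data_array cur_p = *gpt_sampler_get_candidates(smpl);
    (void) cur_p;

    // Accept the sampled token with grammar enabled.
    gpt_sampler_accept(smpl, id, /*accept_grammar=*/true);

    gpt_sampler_free(smpl); // replaces llama_sampling_free()
    return id;
}

The teardown path keeps its shape: gpt_sampler_free simply replaces llama_sampling_free, and initSampling() still frees any previous sampler before creating a new one.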