cui-llama.rn 1.0.7 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +67 -34
- package/cpp/common.h +23 -8
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +55 -22
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +40 -15
- package/cpp/ggml.h +10 -6
- package/cpp/grammar-parser.cpp +3 -0
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +14 -18
- package/cpp/llama-vocab.h +4 -2
- package/cpp/llama.cpp +466 -280
- package/cpp/llama.h +10 -11
- package/cpp/rn-llama.hpp +23 -10
- package/package.json +1 -1
package/cpp/llama.h
CHANGED
@@ -93,15 +93,14 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
         LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
     };
 
-    // note: these values should be synchronized with lm_ggml_rope
-    // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type)
     enum llama_rope_type {
         LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX =  2,
-        LLAMA_ROPE_TYPE_GLM  =  4,
+        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -345,7 +344,7 @@ extern "C" {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum lm_ggml_type output_tensor_type;    // output tensor type
-        enum lm_ggml_type token_embedding_type;  // itoken embeddings tensor type
+        enum lm_ggml_type token_embedding_type;  // token embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -504,6 +503,9 @@ extern "C" {
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
+    // Returns true if the model contains a decoder that requires llama_decode() call
+    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
+
     // For encoder-decoder models, this function returns id of the token that must be provided
     // to the decoder to start generating output sequence. For other models, it returns -1.
     LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
@@ -912,11 +914,8 @@ extern "C" {
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
     // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
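
Taken together, the llama.h changes add a decoder probe next to the existing encoder probe and turn llama_add_bos_token / llama_add_eos_token into plain bool accessors (previously int32_t with -1 for unknown). Below is a minimal caller sketch, not part of the package: "model.gguf" is a placeholder path, and the loading calls (llama_backend_init, llama_load_model_from_file) are the standard llama.h entry points rather than anything introduced by this diff.

```cpp
// Sketch only: exercises the API surface changed in this release.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) return 1;

    // New in this release: decoder probe alongside the existing encoder probe.
    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        // Encoder-decoder model: the decoder needs its start token first.
        printf("decoder start token: %d\n", llama_model_decoder_start_token(model));
    }

    // llama_add_bos_token / llama_add_eos_token now return bool instead of an
    // int32_t tri-state, so callers can branch on them directly.
    if (llama_add_bos_token(model)) {
        printf("tokenizer adds BOS by default\n");
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```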
package/cpp/rn-llama.hpp
CHANGED
@@ -6,13 +6,10 @@
 #include "common.h"
 #include "llama.h"
 
-
 #include <android/log.h>
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
-
-
 namespace rnllama {
 
 static void llama_batch_clear(llama_batch *batch) {
@@ -227,7 +224,9 @@ struct llama_rn_context
     bool loadModel(gpt_params &params_)
     {
         params = params_;
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        llama_init_result result = llama_init_from_gpt_params(params);
+        model = result.model;
+        ctx = result.context;
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model: %s", params_.model.c_str());
@@ -298,7 +297,9 @@ struct llama_rn_context
         }
 
         // do Context Shift , may be buggy! TODO: Verify functionality
-        purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+        if(!params.embedding){
+            purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+        }
 
         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
@@ -306,7 +307,7 @@ struct llama_rn_context
             llama_sampling_accept(ctx_sampling, ctx, token, false);
         }
         // compare the evaluated prompt with the new prompt
-        n_past = common_part(embd, prompt_tokens);
+        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
         LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
         LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
         LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
@@ -343,9 +344,9 @@ struct llama_rn_context
         completion_token_output result;
         result.tok = -1;
 
+        // this truncation should never trigger with good context shifting
         if (embd.size() >= (size_t)params.n_ctx)
         {
-            // Shift context
 
             const int n_left    = n_past - params.n_keep - 1;
             const int n_discard = n_left/2;
@@ -547,9 +548,21 @@ struct llama_rn_context
             LOG_WARNING("embedding disabled, embedding: %s", params.embedding);
             return std::vector<float>(n_embd, 0.0f);
         }
-        float *data = llama_get_embeddings(ctx);
-        std::vector<float> embedding(data, data + n_embd);
-        return embedding;
+        float *data;
+
+        if(params.pooling_type == 0){
+            data = llama_get_embeddings(ctx);
+        }
+        else {
+            data = llama_get_embeddings_seq(ctx, 0);
+        }
+
+        if(!data) {
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
+        llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+        return out;
     }
 
     std::string bench(int pp, int tg, int pl, int nr)
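
For embedding users, the getEmbedding() change above is the main behavioural difference: it reads the per-sequence buffer when a pooling type is configured, falls back to the token-level buffer otherwise, and normalizes the result. Below is a condensed sketch of that pattern, assuming an already-decoded llama_context; the standalone helper name extract_embedding is mine and is not part of rn-llama.hpp.

```cpp
#include "common.h"
#include "llama.h"
#include <vector>

// Condensed illustration of the pooling-aware embedding read-out used above.
static std::vector<float> extract_embedding(llama_context * ctx, const gpt_params & params) {
    const int n_embd = llama_n_embd(llama_get_model(ctx));

    // LLAMA_POOLING_TYPE_NONE (0) keeps raw token embeddings; any other
    // pooling type stores one pooled vector per sequence.
    float * data = params.pooling_type == LLAMA_POOLING_TYPE_NONE
        ? llama_get_embeddings(ctx)
        : llama_get_embeddings_seq(ctx, 0);

    if (data == nullptr) {
        // Decode produced no embeddings (e.g. embeddings disabled): return zeros.
        return std::vector<float>(n_embd, 0.0f);
    }

    std::vector<float> out(data, data + n_embd);
    // params.embd_normalize follows the common.h convention (-1 = none, 2 = euclidean).
    llama_embd_normalize(data, out.data(), n_embd, params.embd_normalize);
    return out;
}
```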