cui-llama.rn 1.0.6 → 1.0.7

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
package/README.md CHANGED
@@ -10,6 +10,7 @@ The following features have been added for Android:
  - Added stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
  - `vocab_only` mode: utilize the llama.cpp tokenizer
  - tokenizeSync: non-blocking, synchronous tokenizer function
+ - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)

  Original repo README.md below.

@@ -211,7 +211,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  for (int i = 0; i < count; i++) {
      char key[256];
      llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-     char val[256];
+     char val[2048];
      llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));

      putString(env, meta, key, val);
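The `char val[256]` → `char val[2048]` change gives `llama_model_meta_val_str_by_index` room for long metadata values (chat templates in particular can exceed 256 bytes). For comparison, here is a minimal sketch, not part of this diff, that avoids a fixed cap by using the call's return value, assuming the llama.cpp convention that these functions return the full value length (or -1 when unavailable):

    // Sketch: enumerate GGUF metadata without a fixed-size value buffer.
    // Assumes llama_model_meta_val_str_by_index returns the full value
    // length (snprintf-style), so a truncated read can be detected and retried.
    #include <cstdio>
    #include <vector>
    #include "llama.h"

    static void print_model_meta(const llama_model * model) {
        const int32_t count = llama_model_meta_count(model);
        for (int32_t i = 0; i < count; i++) {
            char key[256];
            llama_model_meta_key_by_index(model, i, key, sizeof(key));

            std::vector<char> val(256);
            int32_t len = llama_model_meta_val_str_by_index(model, i, val.data(), val.size());
            if (len < 0) {
                continue; // no value stored at this index
            }
            if ((size_t) len >= val.size()) {
                val.resize(len + 1); // grow and read again, untruncated
                llama_model_meta_val_str_by_index(model, i, val.data(), val.size());
            }
            printf("%s = %s\n", key, val.data());
        }
    }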
package/cpp/common.cpp CHANGED
@@ -1640,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
  options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
  options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
  options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
- options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
  options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
  options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
  options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
package/cpp/ggml-quants.c CHANGED
@@ -6449,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
  // compute mask for subtraction
  vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
- vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
  m <<= 1;

  vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
  m <<= 1;

  vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
  m <<= 1;

  vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
- vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
  m <<= 1;

  // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
  vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
  vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
  m <<= 1;

  vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
  vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
  m <<= 1;

  vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
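The `_m` → `_mu` renames track the current RISC-V vector intrinsics spec: the 4-argument `_m` form used before carries no merge operand, so inactive lanes come back unspecified, while the `_mu` (mask-undisturbed) form takes an explicit merge vector whose lanes are preserved where the mask is clear, which is what these kernels rely on. A standalone sketch of the same masked-subtract pattern, not part of this diff, assuming the v1.0 intrinsics in `<riscv_vector.h>` and a vector-enabled toolchain:

    // Sketch: subtract 4 only in lanes whose selected qh bit is zero,
    // leaving every other lane unchanged -- the same pattern as the q3_K path.
    #include <riscv_vector.h>
    #include <stddef.h>
    #include <stdint.h>

    void sub4_where_bit_clear(int8_t * q, const uint8_t * qh, uint8_t bit, size_t n) {
        for (size_t i = 0; i < n; ) {
            size_t vl = __riscv_vsetvl_e8m1(n - i);

            vint8m1_t  vq  = __riscv_vle8_v_i8m1(q + i, vl);
            vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh + i, vl);

            // mask is set where the selected bit of qh is zero
            vbool8_t mask = __riscv_vmseq_vx_u8m1_b8(__riscv_vand_vx_u8m1(vqh, bit, vl), 0, vl);

            // _mu: lanes where the mask is clear keep the value of the merge
            // operand (vq); lanes where it is set get vq - 4
            vint8m1_t res = __riscv_vsub_vx_i8m1_mu(mask, vq, vq, 0x4, vl);

            __riscv_vse8_v_i8m1(q + i, res, vl);
            i += vl;
        }
    }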
package/cpp/ggml.c CHANGED
@@ -141,7 +141,51 @@ typedef pthread_t lm_ggml_thread_t;

  #include <sys/wait.h>

- #if defined(__linux__)
+ #if defined(__ANDROID__)
+ #include <unwind.h>
+ #include <dlfcn.h>
+ #include <stdio.h>
+
+ struct backtrace_state {
+     void ** current;
+     void ** end;
+ };
+
+ static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+     struct backtrace_state * state = (struct backtrace_state *)arg;
+     uintptr_t pc = _Unwind_GetIP(context);
+     if (pc) {
+         if (state->current == state->end) {
+             return _URC_END_OF_STACK;
+         } else {
+             *state->current++ = (void*)pc;
+         }
+     }
+     return _URC_NO_REASON;
+ }
+
+ static void lm_ggml_print_backtrace_symbols(void) {
+     const int max = 100;
+     void* buffer[max];
+
+     struct backtrace_state state = {buffer, buffer + max};
+     _Unwind_Backtrace(unwind_callback, &state);
+
+     int count = state.current - buffer;
+
+     for (int idx = 0; idx < count; ++idx) {
+         const void * addr = buffer[idx];
+         const char * symbol = "";
+
+         Dl_info info;
+         if (dladdr(addr, &info) && info.dli_sname) {
+             symbol = info.dli_sname;
+         }
+
+         fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+     }
+ }
+ #elif defined(__linux__)
  #include <execinfo.h>
  static void lm_ggml_print_backtrace_symbols(void) {
  // void * trace[100];
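Since `<execinfo.h>` is not reliably available on Android's Bionic libc, the new `__ANDROID__` branch walks the stack with `_Unwind_Backtrace` and resolves frame addresses through `dladdr`. The names it prints are still mangled; a small follow-on sketch, not part of the diff, showing how a frame could additionally be demangled with `abi::__cxa_demangle` from `<cxxabi.h>`:

    // Sketch: pretty-print one resolved frame, demangling C++ symbol names.
    // Assumes the Itanium ABI helper from <cxxabi.h>, which the NDK provides.
    #include <cxxabi.h>
    #include <dlfcn.h>
    #include <cstdio>
    #include <cstdlib>

    static void print_frame(int idx, const void * addr) {
        const char * name = "";
        Dl_info info;
        if (dladdr(addr, &info) && info.dli_sname) {
            name = info.dli_sname;
        }

        int status = 0;
        char * demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
        fprintf(stderr, "%d: %p %s\n", idx, addr, status == 0 ? demangled : name);
        free(demangled); // safe: free(nullptr) is a no-op
    }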
package/cpp/llama.cpp CHANGED
@@ -4980,6 +4980,7 @@ static void llm_load_hparams(
  hparams.attn_soft_cap = true;

  switch (hparams.n_layer) {
+     case 26: model.type = e_model::MODEL_2B; break;
      case 42: model.type = e_model::MODEL_9B; break;
      case 46: model.type = e_model::MODEL_27B; break;
      default: model.type = e_model::MODEL_UNKNOWN;
@@ -11747,6 +11748,7 @@ struct llm_build_context {

  // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  switch (model.type) {
+     case e_model::MODEL_2B:
      case e_model::MODEL_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
      case e_model::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
      default: LM_GGML_ABORT("fatal error");
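The new `case 26` maps Gemma-2 2B (26 layers) to `MODEL_2B`, and the graph builder then scales its queries by `1.0f / sqrtf(n_embd_head_k)` like the 9B, rather than by `1.0f / sqrtf(n_embd / n_head)` like the 27B. The distinction only matters because the 2B's head dimension is not `n_embd / n_head`; a quick arithmetic sketch using the Gemma-2 2B shape as I understand it (n_embd = 2304, n_head = 8, head dim = 256 -- an assumption, not taken from this diff):

    // Sketch: the two candidate query scales for the assumed Gemma-2 2B dims.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_embd        = 2304.0f; // assumed Gemma-2 2B hidden size
        const float n_head        = 8.0f;    // assumed attention head count
        const float n_embd_head_k = 256.0f;  // assumed per-head query/key dim

        // MODEL_2B / MODEL_9B branch: 1/sqrt(head dim)
        printf("1/sqrt(n_embd_head_k) = %f\n", 1.0f / sqrtf(n_embd_head_k));   // ~0.0625
        // MODEL_27B branch: 1/sqrt(n_embd / n_head)
        printf("1/sqrt(n_embd/n_head) = %f\n", 1.0f / sqrtf(n_embd / n_head)); // ~0.0589
        return 0;
    }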
package/cpp/rn-llama.hpp CHANGED
@@ -240,9 +240,11 @@ struct llama_rn_context
  bool validateModelChatTemplate() const {
      llama_chat_message chat[] = {{"user", "test"}};

-     const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+     std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+     std::string template_key = "tokenizer.chat_template";
+     int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());

-     return res > 0;
+     return res >= 0;
  }

  void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
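`validateModelChatTemplate` now looks for the `tokenizer.chat_template` GGUF key directly and treats any non-negative result (the stored value's length, possibly 0) as present, instead of probing `llama_chat_apply_template` with a null buffer. A hedged sketch, not from this diff, of how a caller might pair that presence check with actually formatting a conversation, assuming the llama.cpp convention that both calls return the required length (negative on failure):

    // Sketch: chat-template presence check plus two-pass formatting.
    #include <string>
    #include <vector>
    #include "llama.h"

    static bool has_chat_template(const llama_model * model) {
        char tmp[1];
        // -1 => key absent; >= 0 => length of the stored template (may be 0)
        return llama_model_meta_val_str(model, "tokenizer.chat_template", tmp, sizeof(tmp)) >= 0;
    }

    static std::string format_chat(const llama_model * model,
                                   const std::vector<llama_chat_message> & msgs) {
        std::vector<char> out(4096);
        int32_t len = llama_chat_apply_template(model, nullptr /* use the model's template */,
                                                msgs.data(), msgs.size(), /* add assistant turn */ true,
                                                out.data(), (int32_t) out.size());
        if (len < 0) {
            return ""; // no usable template
        }
        if ((size_t) len > out.size()) {
            out.resize(len); // buffer was too small: grow to the reported length and retry
            len = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(), true,
                                            out.data(), (int32_t) out.size());
        }
        return std::string(out.data(), len);
    }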
@@ -102,7 +102,7 @@
  for (int i = 0; i < count; i++) {
      char key[256];
      llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-     char val[256];
+     char val[2048];
      llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));

      NSString *keyStr = [NSString stringWithUTF8String:key];
package/jest/mock.js CHANGED
@@ -10,6 +10,9 @@ if (!NativeModules.RNLlama) {
      }),
    ),

+   // TODO: Use jinja parser
+   getFormattedChat: jest.fn(() => ''),
+
    completion: jest.fn(async (contextId, jobId) => {
      const testResult = {
        text: '*giggles*',
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "cui-llama.rn",
-   "version": "1.0.6",
+   "version": "1.0.7",
    "description": "Fork of llama.rn for ChatterUI",
    "main": "lib/commonjs/index",
    "module": "lib/module/index",