cui-llama.rn 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,15 @@
  # cui-llama.rn
 
- This is a fork of llama.rn meant for ChatterUI
+ This is a fork of [llama.rn](https://github.com/mybigday/llama.rn) meant for [ChatterUI](https://github.com/Vali-98/ChatterUI).
+
+ This fork exists to update llama.cpp on a more frequent basis and to add features useful for ChatterUI.
+
+ The following features have been added for Android:
+
+ - Updated sync with llama.cpp
+ - Stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
+ - `vocab_only` mode: use only the llama.cpp tokenizer
+ - `tokenizeSync`: synchronous tokenizer function
 
  Original repo README.md below.
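For reference, a minimal TypeScript sketch of how the two API-level additions in the feature list above might be used. It assumes the fork keeps llama.rn's `initLlama` entry point, that `vocab_only` is accepted as an init parameter, and that `tokenizeSync` and `release` are exposed on the returned context; check the package's TypeScript definitions for the actual names and shapes.

```ts
// Minimal sketch, not the package's documented API: assumes initLlama() from llama.rn,
// a vocab_only init flag, and a tokenizeSync() method on the returned context.
import { initLlama } from 'cui-llama.rn';

export async function countTokens(modelPath: string, text: string): Promise<number> {
  // vocab_only: load just the llama.cpp tokenizer, skipping the model weights.
  const context = await initLlama({ model: modelPath, vocab_only: true });
  try {
    // Assumed to return { tokens: number[] } like llama.rn's async tokenize(),
    // but directly instead of through a Promise.
    const { tokens } = context.tokenizeSync(text);
    return tokens.length;
  } finally {
    await context.release();
  }
}
```

The combination is useful for token counting without paying the cost of loading full model weights.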
@@ -23,13 +23,14 @@ set(
      ${RNLLAMA_LIB_DIR}/unicode.cpp
      ${RNLLAMA_LIB_DIR}/llama.cpp
      ${RNLLAMA_LIB_DIR}/sgemm.cpp
+     ${RNLLAMA_LIB_DIR}/ggml-aarch64.c
      ${RNLLAMA_LIB_DIR}/rn-llama.hpp
      ${CMAKE_SOURCE_DIR}/jni.cpp
  )
 
  find_library(LOG_LIB log)
 
- function(build_library target_name)
+ function(build_library target_name cpu_flags)
      add_library(
          ${target_name}
          SHARED
@@ -38,32 +39,34 @@ function(build_library target_name)
 
      target_link_libraries(${target_name} ${LOG_LIB} android)
 
-     target_compile_options(${target_name} PRIVATE -pthread)
-
-     if (${target_name} STREQUAL "rnllama_v8fp16_va")
-         target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod)
-     endif ()
+     target_compile_options(${target_name} PRIVATE -pthread ${cpu_flags})
 
      if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
          target_compile_options(${target_name} PRIVATE -DRNLLAMA_ANDROID_ENABLE_LOGGING)
      endif ()
 
-     # NOTE: If you want to debug the native code, you can uncomment if and endif
-     # if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+     #if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+     target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
+     target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
+     target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
 
-     target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
-     target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
-     target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
-
-     target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
-     target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
-     target_link_options(${target_name} PRIVATE -flto)
-
-     # endif ()
+     target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
+     target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
+     target_link_options(${target_name} PRIVATE -flto)
+     #endif ()
  endfunction()
 
- build_library("rnllama") # Default target
+ # Default target (no specific CPU features)
+ build_library("rnllama" "")
 
  if (${ANDROID_ABI} STREQUAL "arm64-v8a")
-     build_library("rnllama_v8fp16_va")
+     # ARM64 targets
+     build_library("rnllama_v8_4_fp16_dotprod" "-march=armv8.4-a+fp16+dotprod")
+     build_library("rnllama_v8_2_fp16_dotprod" "-march=armv8.2-a+fp16+dotprod")
+     build_library("rnllama_v8_2_fp16" "-march=armv8.2-a+fp16")
+     build_library("rnllama_v8" "-march=armv8-a")
+ elseif (${ANDROID_ABI} STREQUAL "x86_64")
+     # x86_64 target
+     build_library("rnllama_x86_64" "-march=x86-64" "-mtune=intel" "-msse4.2" "-mpopcnt")
+
  endif ()
@@ -17,6 +17,7 @@ import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.File;
  import java.io.IOException;
+ import java.io.FileInputStream;
 
  public class LlamaContext {
    public static final String NAME = "RNLlamaContext";
@@ -28,6 +29,35 @@ public class LlamaContext {
    private int jobId = -1;
    private DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
 
+   private byte[] ggufHeader = {0x47, 0x47, 0x55, 0x46};
+
+   private boolean isGGUF(final String filepath) {
+     byte[] fileHeader = new byte[4];
+     FileInputStream fis = null;
+     try {
+       fis = new FileInputStream(filepath);
+       int bytesRead = fis.read(fileHeader);
+       if (bytesRead < 4) {
+         return false;
+       }
+       for (int i = 0; i < 4; i++) {
+         if (fileHeader[i] != ggufHeader[i])
+           return false;
+       }
+       return true;
+     } catch (Exception e) {
+       return false;
+     } finally {
+       if (fis != null) {
+         try {
+           fis.close();
+         } catch (Exception e) {
+           Log.d(NAME, "Closing FileInputStream failed.");
+         }
+       }
+     }
+   }
+
    public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap params) {
      if (LlamaContext.isArm64V8a() == false && LlamaContext.isX86_64() == false) {
        throw new IllegalStateException("Only 64-bit architectures are supported");
@@ -35,6 +65,11 @@ public class LlamaContext {
      if (!params.hasKey("model")) {
        throw new IllegalArgumentException("Missing required parameter: model");
      }
+     // Check if file has GGUF magic numbers
+     if (!isGGUF(params.getString("model"))) {
+       throw new IllegalArgumentException("File is not in GGUF format");
+     }
+
      this.id = id;
      this.context = initContext(
        // String model,
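The constructor now rejects anything that does not start with the 4-byte GGUF magic `0x47 0x47 0x55 0x46` (ASCII "GGUF"). For illustration only, the same magic-number check in TypeScript using Node-style file I/O; this is not part of the package, whose authoritative check is the Java `isGGUF` above.

```ts
// Illustrative sketch of the GGUF magic-number check above, using Node's fs APIs.
// The package performs this check natively in Java (LlamaContext.isGGUF).
import { openSync, readSync, closeSync } from 'node:fs';

const GGUF_MAGIC = Buffer.from([0x47, 0x47, 0x55, 0x46]); // ASCII "GGUF"

export function isGGUF(filepath: string): boolean {
  const header = Buffer.alloc(4);
  let fd: number | undefined;
  try {
    fd = openSync(filepath, 'r');
    // Read the first four bytes and compare them against the GGUF magic.
    const bytesRead = readSync(fd, header, 0, 4, 0);
    return bytesRead === 4 && header.equals(GGUF_MAGIC);
  } catch {
    return false;
  } finally {
    if (fd !== undefined) closeSync(fd);
  }
}
```

The magic is simply the string "GGUF" at the start of every GGUF file, which the Java code compares byte by byte before calling `initContext`.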
@@ -239,28 +274,32 @@ public class LlamaContext {
    static {
      Log.d(NAME, "Primary ABI: " + Build.SUPPORTED_ABIS[0]);
      if (LlamaContext.isArm64V8a()) {
-       boolean loadV8fp16 = false;
-       if (LlamaContext.isArm64V8a()) {
-         // ARMv8.2a needs runtime detection support
-         String cpuInfo = LlamaContext.cpuInfo();
-         if (cpuInfo != null) {
-           Log.d(NAME, "CPU info: " + cpuInfo);
-           if (cpuInfo.contains("fphp")) {
-             Log.d(NAME, "CPU supports fp16 arithmetic");
-             loadV8fp16 = true;
-           }
-         }
-       }
+       String cpuFeatures = LlamaContext.getCpuFeatures();
+       Log.d(NAME, "CPU features: " + cpuFeatures);
 
-       if (loadV8fp16) {
-         Log.d(NAME, "Loading librnllama_v8fp16_va.so");
-         System.loadLibrary("rnllama_v8fp16_va");
+       boolean hasFp16 = cpuFeatures.contains("fp16") || cpuFeatures.contains("fphp");
+       boolean hasDotProd = cpuFeatures.contains("dotprod") || cpuFeatures.contains("asimddp");
+       boolean isAtLeastArmV82 = cpuFeatures.contains("asimd") && cpuFeatures.contains("crc32") && cpuFeatures.contains("aes");
+       boolean isAtLeastArmV84 = cpuFeatures.contains("dcpop") && cpuFeatures.contains("uscat");
+
+       if (isAtLeastArmV84 && hasFp16 && hasDotProd) {
+         Log.d(NAME, "Loading librnllama_v8_4_fp16_dotprod.so");
+         System.loadLibrary("rnllama_v8_4_fp16_dotprod");
+       } else if (isAtLeastArmV82 && hasFp16 && hasDotProd) {
+         Log.d(NAME, "Loading librnllama_v8_2_fp16_dotprod.so");
+         System.loadLibrary("rnllama_v8_2_fp16_dotprod");
+       } else if (isAtLeastArmV82 && hasFp16) {
+         Log.d(NAME, "Loading librnllama_v8_2_fp16.so");
+         System.loadLibrary("rnllama_v8_2_fp16");
        } else {
-         Log.d(NAME, "Loading librnllama.so");
-         System.loadLibrary("rnllama");
+         Log.d(NAME, "Loading librnllama_v8.so");
+         System.loadLibrary("rnllama_v8");
        }
      } else if (LlamaContext.isX86_64()) {
-       Log.d(NAME, "Loading librnllama.so");
+       Log.d(NAME, "Loading librnllama_x86_64.so");
+       System.loadLibrary("rnllama_x86_64");
+     } else {
+       Log.d(NAME, "Loading default librnllama.so");
        System.loadLibrary("rnllama");
      }
    }
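The variant selection above reduces to a pure mapping from the ABI and the `/proc/cpuinfo` "Features" line to a library name. A TypeScript sketch of the same decision table, for reference only; the authoritative logic is the Java static initializer above, and the feature string comes from `getCpuFeatures()` in the next hunk.

```ts
// Illustrative mirror of the library-selection logic above; the real check runs
// in LlamaContext's static initializer (Java) against the /proc/cpuinfo Features line.
export function pickLlamaVariant(abi: string, cpuFeatures: string): string {
  if (abi === 'arm64-v8a') {
    const hasFp16 = cpuFeatures.includes('fp16') || cpuFeatures.includes('fphp');
    const hasDotProd = cpuFeatures.includes('dotprod') || cpuFeatures.includes('asimddp');
    const isAtLeastArmV82 =
      cpuFeatures.includes('asimd') && cpuFeatures.includes('crc32') && cpuFeatures.includes('aes');
    const isAtLeastArmV84 = cpuFeatures.includes('dcpop') && cpuFeatures.includes('uscat');

    if (isAtLeastArmV84 && hasFp16 && hasDotProd) return 'rnllama_v8_4_fp16_dotprod';
    if (isAtLeastArmV82 && hasFp16 && hasDotProd) return 'rnllama_v8_2_fp16_dotprod';
    if (isAtLeastArmV82 && hasFp16) return 'rnllama_v8_2_fp16';
    return 'rnllama_v8';
  }
  if (abi === 'x86_64') return 'rnllama_x86_64';
  return 'rnllama'; // default build, no specific CPU features
}

// The feature string itself is just the "Features" line of /proc/cpuinfo, or "" on failure.
export function extractFeaturesLine(cpuinfo: string): string {
  for (const line of cpuinfo.split('\n')) {
    if (line.startsWith('Features')) return line;
  }
  return '';
}
```

Each name returned here corresponds to one of the `build_library()` targets added in the CMake hunk above.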
@@ -273,20 +312,23 @@ public class LlamaContext {
      return Build.SUPPORTED_ABIS[0].equals("x86_64");
    }
 
-   private static String cpuInfo() {
+   private static String getCpuFeatures() {
      File file = new File("/proc/cpuinfo");
      StringBuilder stringBuilder = new StringBuilder();
      try {
        BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
        String line;
        while ((line = bufferedReader.readLine()) != null) {
+         if (line.startsWith("Features")) {
            stringBuilder.append(line);
+           break;
+         }
        }
        bufferedReader.close();
        return stringBuilder.toString();
      } catch (IOException e) {
        Log.w(NAME, "Couldn't read /proc/cpuinfo", e);
-       return null;
+       return "";
      }
    }
 
package/cpp/common.cpp CHANGED
@@ -691,7 +691,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
      if (arg == "--lora") {
          CHECK_ARG
          params.lora_adapter.emplace_back(argv[i], 1.0f);
-         params.use_mmap = false;
          return true;
      }
      if (arg == "--lora-scaled") {
@@ -699,7 +698,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          const char* lora_adapter = argv[i];
          CHECK_ARG
          params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-         params.use_mmap = false;
          return true;
      }
      if (arg == "--lora-base") {
@@ -2095,19 +2093,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
      for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
          const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
          float lora_scale = std::get<1>(params.lora_adapter[i]);
-         int err = llama_model_apply_lora_from_file(model,
-                                                    lora_adapter.c_str(),
-                                                    lora_scale,
-                                                    ((i > 0) || params.lora_base.empty())
-                                                        ? NULL
-                                                        : params.lora_base.c_str(),
-                                                    params.n_threads);
-         if (err != 0) {
+         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+         if (adapter == nullptr) {
              fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
              llama_free(lctx);
              llama_free_model(model);
              return std::make_tuple(nullptr, nullptr);
          }
+         llama_lora_adapter_set(lctx, adapter, lora_scale);
      }
 
      if (params.ignore_eos) {
@@ -2146,8 +2139,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
  struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
      auto mparams = llama_model_default_params();
+     mparams.vocab_only = params.vocab_only;
 
-     mparams.vocab_only = params.vocab_only;
      if (params.n_gpu_layers != -1) {
          mparams.n_gpu_layers = params.n_gpu_layers;
      }
package/cpp/common.h CHANGED
@@ -72,7 +72,7 @@ enum dimre_method {
  struct gpt_params {
      uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-     bool vocab_only = false;
+     bool vocab_only = false;
      int32_t n_threads = cpu_get_num_math();
      int32_t n_threads_draft = -1;
      int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)