cui-llama.rn 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,15 @@
  # cui-llama.rn
 
- This is a fork of llama.rn meant for ChatterUI
+ This is a fork of [llama.rn](https://github.com/mybigday/llama.rn) meant for [ChatterUI](https://github.com/Vali-98/ChatterUI).
+
+ This fork exists to update llama.cpp on a more frequent basis and to add features useful for ChatterUI.
+
+ The following features have been added for Android:
+
+ - Updated sync with llama.cpp
+ - Stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
+ - `vocab_only` mode: use only the llama.cpp tokenizer
+ - `tokenizeSync`: synchronous tokenizer function
 
  Original repo README.md below.
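For reference, a minimal TypeScript sketch of how the two API-level additions in the feature list above might be used. It assumes the fork keeps llama.rn's `initLlama` entry point, that `vocab_only` is accepted as an init parameter, and that `tokenizeSync` and `release` are exposed on the returned context; check the package's TypeScript definitions for the actual names and shapes.

```ts
// Minimal sketch, not the package's documented API: assumes initLlama() from llama.rn,
// a vocab_only init flag, and a tokenizeSync() method on the returned context.
import { initLlama } from 'cui-llama.rn';

export async function countTokens(modelPath: string, text: string): Promise<number> {
  // vocab_only: load just the llama.cpp tokenizer, skipping the model weights.
  const context = await initLlama({ model: modelPath, vocab_only: true });
  try {
    // Assumed to return { tokens: number[] } like llama.rn's async tokenize(),
    // but directly instead of through a Promise.
    const { tokens } = context.tokenizeSync(text);
    return tokens.length;
  } finally {
    await context.release();
  }
}
```

The combination is useful for token counting without paying the cost of loading full model weights.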
@@ -23,13 +23,14 @@ set(
      ${RNLLAMA_LIB_DIR}/unicode.cpp
      ${RNLLAMA_LIB_DIR}/llama.cpp
      ${RNLLAMA_LIB_DIR}/sgemm.cpp
+     ${RNLLAMA_LIB_DIR}/ggml-aarch64.c
      ${RNLLAMA_LIB_DIR}/rn-llama.hpp
      ${CMAKE_SOURCE_DIR}/jni.cpp
  )
 
  find_library(LOG_LIB log)
 
- function(build_library target_name)
+ function(build_library target_name cpu_flags)
      add_library(
          ${target_name}
          SHARED
@@ -38,32 +39,34 @@ function(build_library target_name)
 
      target_link_libraries(${target_name} ${LOG_LIB} android)
 
-     target_compile_options(${target_name} PRIVATE -pthread)
-
-     if (${target_name} STREQUAL "rnllama_v8fp16_va")
-         target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod)
-     endif ()
+     target_compile_options(${target_name} PRIVATE -pthread ${cpu_flags})
 
      if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
          target_compile_options(${target_name} PRIVATE -DRNLLAMA_ANDROID_ENABLE_LOGGING)
      endif ()
 
-     # NOTE: If you want to debug the native code, you can uncomment if and endif
-     # if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+     #if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+     target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
+     target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
+     target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
 
-     target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
-     target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
-     target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
-
-     target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
-     target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
-     target_link_options(${target_name} PRIVATE -flto)
-
-     # endif ()
+     target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
+     target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
+     target_link_options(${target_name} PRIVATE -flto)
+     #endif ()
  endfunction()
 
- build_library("rnllama") # Default target
+ # Default target (no specific CPU features)
+ build_library("rnllama" "")
 
  if (${ANDROID_ABI} STREQUAL "arm64-v8a")
-     build_library("rnllama_v8fp16_va")
+     # ARM64 targets
+     build_library("rnllama_v8_4_fp16_dotprod" "-march=armv8.4-a+fp16+dotprod")
+     build_library("rnllama_v8_2_fp16_dotprod" "-march=armv8.2-a+fp16+dotprod")
+     build_library("rnllama_v8_2_fp16" "-march=armv8.2-a+fp16")
+     build_library("rnllama_v8" "-march=armv8-a")
+ elseif (${ANDROID_ABI} STREQUAL "x86_64")
+     # x86_64 target
+     build_library("rnllama_x86_64" "-march=x86-64" "-mtune=intel" "-msse4.2" "-mpopcnt")
+
  endif ()
@@ -17,6 +17,7 @@ import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.File;
  import java.io.IOException;
+ import java.io.FileInputStream;
 
  public class LlamaContext {
    public static final String NAME = "RNLlamaContext";
@@ -28,6 +29,35 @@ public class LlamaContext {
    private int jobId = -1;
    private DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
 
+   private byte[] ggufHeader = {0x47, 0x47, 0x55, 0x46};
+
+   private boolean isGGUF(final String filepath) {
+     byte[] fileHeader = new byte[4];
+     FileInputStream fis = null;
+     try {
+       fis = new FileInputStream(filepath);
+       int bytesRead = fis.read(fileHeader);
+       if (bytesRead < 4) {
+         return false;
+       }
+       for (int i = 0; i < 4; i++) {
+         if (fileHeader[i] != ggufHeader[i])
+           return false;
+       }
+       return true;
+     } catch (Exception e) {
+       return false;
+     } finally {
+       if (fis != null) {
+         try {
+           fis.close();
+         } catch (Exception e) {
+           Log.d(NAME, "Closing FileInputStream failed.");
+         }
+       }
+     }
+   }
+
    public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap params) {
      if (LlamaContext.isArm64V8a() == false && LlamaContext.isX86_64() == false) {
        throw new IllegalStateException("Only 64-bit architectures are supported");
@@ -35,6 +65,11 @@ public class LlamaContext {
      if (!params.hasKey("model")) {
        throw new IllegalArgumentException("Missing required parameter: model");
      }
+     // Check if file has GGUF magic numbers
+     if (!isGGUF(params.getString("model"))) {
+       throw new IllegalArgumentException("File is not in GGUF format");
+     }
+
      this.id = id;
      this.context = initContext(
        // String model,
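The constructor now rejects anything that does not start with the 4-byte GGUF magic `0x47 0x47 0x55 0x46` (ASCII "GGUF"). For illustration only, the same magic-number check in TypeScript using Node-style file I/O; this is not part of the package, whose authoritative check is the Java `isGGUF` above.

```ts
// Illustrative sketch of the GGUF magic-number check above, using Node's fs APIs.
// The package performs this check natively in Java (LlamaContext.isGGUF).
import { openSync, readSync, closeSync } from 'node:fs';

const GGUF_MAGIC = Buffer.from([0x47, 0x47, 0x55, 0x46]); // ASCII "GGUF"

export function isGGUF(filepath: string): boolean {
  const header = Buffer.alloc(4);
  let fd: number | undefined;
  try {
    fd = openSync(filepath, 'r');
    // Read the first four bytes and compare them against the GGUF magic.
    const bytesRead = readSync(fd, header, 0, 4, 0);
    return bytesRead === 4 && header.equals(GGUF_MAGIC);
  } catch {
    return false;
  } finally {
    if (fd !== undefined) closeSync(fd);
  }
}
```

The magic is simply the string "GGUF" at the start of every GGUF file, which the Java code compares byte by byte before calling `initContext`.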
@@ -239,28 +274,32 @@ public class LlamaContext {
    static {
      Log.d(NAME, "Primary ABI: " + Build.SUPPORTED_ABIS[0]);
      if (LlamaContext.isArm64V8a()) {
-       boolean loadV8fp16 = false;
-       if (LlamaContext.isArm64V8a()) {
-         // ARMv8.2a needs runtime detection support
-         String cpuInfo = LlamaContext.cpuInfo();
-         if (cpuInfo != null) {
-           Log.d(NAME, "CPU info: " + cpuInfo);
-           if (cpuInfo.contains("fphp")) {
-             Log.d(NAME, "CPU supports fp16 arithmetic");
-             loadV8fp16 = true;
-           }
-         }
-       }
+       String cpuFeatures = LlamaContext.getCpuFeatures();
+       Log.d(NAME, "CPU features: " + cpuFeatures);
 
-       if (loadV8fp16) {
-         Log.d(NAME, "Loading librnllama_v8fp16_va.so");
-         System.loadLibrary("rnllama_v8fp16_va");
+       boolean hasFp16 = cpuFeatures.contains("fp16") || cpuFeatures.contains("fphp");
+       boolean hasDotProd = cpuFeatures.contains("dotprod") || cpuFeatures.contains("asimddp");
+       boolean isAtLeastArmV82 = cpuFeatures.contains("asimd") && cpuFeatures.contains("crc32") && cpuFeatures.contains("aes");
+       boolean isAtLeastArmV84 = cpuFeatures.contains("dcpop") && cpuFeatures.contains("uscat");
+
+       if (isAtLeastArmV84 && hasFp16 && hasDotProd) {
+         Log.d(NAME, "Loading librnllama_v8_4_fp16_dotprod.so");
+         System.loadLibrary("rnllama_v8_4_fp16_dotprod");
+       } else if (isAtLeastArmV82 && hasFp16 && hasDotProd) {
+         Log.d(NAME, "Loading librnllama_v8_2_fp16_dotprod.so");
+         System.loadLibrary("rnllama_v8_2_fp16_dotprod");
+       } else if (isAtLeastArmV82 && hasFp16) {
+         Log.d(NAME, "Loading librnllama_v8_2_fp16.so");
+         System.loadLibrary("rnllama_v8_2_fp16");
        } else {
-         Log.d(NAME, "Loading librnllama.so");
-         System.loadLibrary("rnllama");
+         Log.d(NAME, "Loading librnllama_v8.so");
+         System.loadLibrary("rnllama_v8");
        }
      } else if (LlamaContext.isX86_64()) {
-       Log.d(NAME, "Loading librnllama.so");
+       Log.d(NAME, "Loading librnllama_x86_64.so");
+       System.loadLibrary("rnllama_x86_64");
+     } else {
+       Log.d(NAME, "Loading default librnllama.so");
        System.loadLibrary("rnllama");
      }
    }
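The variant selection above reduces to a pure mapping from the ABI and the `/proc/cpuinfo` "Features" line to a library name. A TypeScript sketch of the same decision table, for reference only; the authoritative logic is the Java static initializer above, and the feature string comes from `getCpuFeatures()` in the next hunk.

```ts
// Illustrative mirror of the library-selection logic above; the real check runs
// in LlamaContext's static initializer (Java) against the /proc/cpuinfo Features line.
export function pickLlamaVariant(abi: string, cpuFeatures: string): string {
  if (abi === 'arm64-v8a') {
    const hasFp16 = cpuFeatures.includes('fp16') || cpuFeatures.includes('fphp');
    const hasDotProd = cpuFeatures.includes('dotprod') || cpuFeatures.includes('asimddp');
    const isAtLeastArmV82 =
      cpuFeatures.includes('asimd') && cpuFeatures.includes('crc32') && cpuFeatures.includes('aes');
    const isAtLeastArmV84 = cpuFeatures.includes('dcpop') && cpuFeatures.includes('uscat');

    if (isAtLeastArmV84 && hasFp16 && hasDotProd) return 'rnllama_v8_4_fp16_dotprod';
    if (isAtLeastArmV82 && hasFp16 && hasDotProd) return 'rnllama_v8_2_fp16_dotprod';
    if (isAtLeastArmV82 && hasFp16) return 'rnllama_v8_2_fp16';
    return 'rnllama_v8';
  }
  if (abi === 'x86_64') return 'rnllama_x86_64';
  return 'rnllama'; // default build, no specific CPU features
}

// The feature string itself is just the "Features" line of /proc/cpuinfo, or "" on failure.
export function extractFeaturesLine(cpuinfo: string): string {
  for (const line of cpuinfo.split('\n')) {
    if (line.startsWith('Features')) return line;
  }
  return '';
}
```

Each name returned here corresponds to one of the `build_library()` targets added in the CMake hunk above.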
@@ -273,20 +312,23 @@ public class LlamaContext {
      return Build.SUPPORTED_ABIS[0].equals("x86_64");
    }
 
-   private static String cpuInfo() {
+   private static String getCpuFeatures() {
      File file = new File("/proc/cpuinfo");
      StringBuilder stringBuilder = new StringBuilder();
      try {
        BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
        String line;
        while ((line = bufferedReader.readLine()) != null) {
+         if (line.startsWith("Features")) {
            stringBuilder.append(line);
+           break;
+         }
        }
        bufferedReader.close();
        return stringBuilder.toString();
      } catch (IOException e) {
        Log.w(NAME, "Couldn't read /proc/cpuinfo", e);
-       return null;
+       return "";
      }
    }
 
package/cpp/common.cpp CHANGED
@@ -691,7 +691,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
      if (arg == "--lora") {
          CHECK_ARG
          params.lora_adapter.emplace_back(argv[i], 1.0f);
-         params.use_mmap = false;
          return true;
      }
      if (arg == "--lora-scaled") {
@@ -699,7 +698,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          const char* lora_adapter = argv[i];
          CHECK_ARG
          params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-         params.use_mmap = false;
          return true;
      }
      if (arg == "--lora-base") {
@@ -2095,19 +2093,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
      for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
          const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
          float lora_scale = std::get<1>(params.lora_adapter[i]);
-         int err = llama_model_apply_lora_from_file(model,
-                                                    lora_adapter.c_str(),
-                                                    lora_scale,
-                                                    ((i > 0) || params.lora_base.empty())
-                                                        ? NULL
-                                                        : params.lora_base.c_str(),
-                                                    params.n_threads);
-         if (err != 0) {
+         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+         if (adapter == nullptr) {
              fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
              llama_free(lctx);
              llama_free_model(model);
              return std::make_tuple(nullptr, nullptr);
          }
+         llama_lora_adapter_set(lctx, adapter, lora_scale);
      }
 
      if (params.ignore_eos) {
@@ -2146,8 +2139,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
  struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
      auto mparams = llama_model_default_params();
+     mparams.vocab_only = params.vocab_only;
 
-     mparams.vocab_only = params.vocab_only;
      if (params.n_gpu_layers != -1) {
          mparams.n_gpu_layers = params.n_gpu_layers;
      }
package/cpp/common.h CHANGED
@@ -72,7 +72,7 @@ enum dimre_method {
  struct gpt_params {
      uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-     bool vocab_only = false;
+     bool vocab_only = false;
      int32_t n_threads = cpu_get_num_math();
      int32_t n_threads_draft = -1;
      int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)