cui-llama.rn 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,13 +23,14 @@ set(
  ${RNLLAMA_LIB_DIR}/unicode.cpp
  ${RNLLAMA_LIB_DIR}/llama.cpp
  ${RNLLAMA_LIB_DIR}/sgemm.cpp
+ ${RNLLAMA_LIB_DIR}/ggml-aarch64.c
  ${RNLLAMA_LIB_DIR}/rn-llama.hpp
  ${CMAKE_SOURCE_DIR}/jni.cpp
  )

  find_library(LOG_LIB log)

- function(build_library target_name)
+ function(build_library target_name cpu_flags)
  add_library(
  ${target_name}
  SHARED
@@ -38,32 +39,35 @@ function(build_library target_name)

  target_link_libraries(${target_name} ${LOG_LIB} android)

- target_compile_options(${target_name} PRIVATE -pthread)
-
- if (${target_name} STREQUAL "rnllama_v8fp16_va")
- target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod)
- endif ()
+ target_compile_options(${target_name} PRIVATE -pthread ${cpu_flags})

  if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
  target_compile_options(${target_name} PRIVATE -DRNLLAMA_ANDROID_ENABLE_LOGGING)
  endif ()

- # NOTE: If you want to debug the native code, you can uncomment if and endif
- # if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+ #if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+ target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
+ target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
+ target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)

- target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
- target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
- target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
-
- target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
- target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
- target_link_options(${target_name} PRIVATE -flto)
-
- # endif ()
+ target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
+ target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
+ target_link_options(${target_name} PRIVATE -flto)
+ #endif ()
  endfunction()

- build_library("rnllama") # Default target
+ # Default target (no specific CPU features)
+ build_library("rnllama" "")

  if (${ANDROID_ABI} STREQUAL "arm64-v8a")
- build_library("rnllama_v8fp16_va")
+ # ARM64 targets
+ build_library("rnllama_v8_4_fp16_dotprod_i8mm" "-march=armv8.4-a+fp16+dotprod+i8mm")
+ build_library("rnllama_v8_4_fp16_dotprod" "-march=armv8.4-a+fp16+dotprod")
+ build_library("rnllama_v8_2_fp16_dotprod" "-march=armv8.2-a+fp16+dotprod")
+ build_library("rnllama_v8_2_fp16" "-march=armv8.2-a+fp16")
+ build_library("rnllama_v8" "-march=armv8-a")
+ elseif (${ANDROID_ABI} STREQUAL "x86_64")
+ # x86_64 target
+ build_library("rnllama_x86_64" "-march=x86-64" "-mtune=intel" "-msse4.2" "-mpopcnt")
+
  endif ()
@@ -274,28 +274,36 @@ public class LlamaContext {
  static {
  Log.d(NAME, "Primary ABI: " + Build.SUPPORTED_ABIS[0]);
  if (LlamaContext.isArm64V8a()) {
- boolean loadV8fp16 = false;
- if (LlamaContext.isArm64V8a()) {
- // ARMv8.2a needs runtime detection support
- String cpuInfo = LlamaContext.cpuInfo();
- if (cpuInfo != null) {
- Log.d(NAME, "CPU info: " + cpuInfo);
- if (cpuInfo.contains("fphp")) {
- Log.d(NAME, "CPU supports fp16 arithmetic");
- loadV8fp16 = true;
- }
- }
- }
-
- if (loadV8fp16) {
- Log.d(NAME, "Loading librnllama_v8fp16_va.so");
- System.loadLibrary("rnllama_v8fp16_va");
+ String cpuFeatures = LlamaContext.getCpuFeatures();
+ Log.d(NAME, "CPU features: " + cpuFeatures);
+
+ boolean hasFp16 = cpuFeatures.contains("fp16") || cpuFeatures.contains("fphp");
+ boolean hasDotProd = cpuFeatures.contains("dotprod") || cpuFeatures.contains("asimddp");
+ boolean isAtLeastArmV82 = cpuFeatures.contains("asimd") && cpuFeatures.contains("crc32") && cpuFeatures.contains("aes");
+ boolean isAtLeastArmV84 = cpuFeatures.contains("dcpop") && cpuFeatures.contains("uscat");
+ boolean hasInt8Matmul = cpuFeatures.contains("i8mm");
+
+ if (isAtLeastArmV84 && hasFp16 && hasDotProd && hasInt8Matmul) {
+ Log.d(NAME, "Loading librnllama_v8_4_fp16_dotprod_i8mm.so");
+ System.loadLibrary("rnllama_v8_4_fp16_dotprod_i8mm");
+ } else if (isAtLeastArmV84 && hasFp16 && hasDotProd) {
+ Log.d(NAME, "Loading librnllama_v8_4_fp16_dotprod.so");
+ System.loadLibrary("rnllama_v8_4_fp16_dotprod");
+ } else if (isAtLeastArmV82 && hasFp16 && hasDotProd) {
+ Log.d(NAME, "Loading librnllama_v8_2_fp16_dotprod.so");
+ System.loadLibrary("rnllama_v8_2_fp16_dotprod");
+ } else if (isAtLeastArmV82 && hasFp16) {
+ Log.d(NAME, "Loading librnllama_v8_2_fp16.so");
+ System.loadLibrary("rnllama_v8_2_fp16");
  } else {
- Log.d(NAME, "Loading librnllama.so");
- System.loadLibrary("rnllama");
+ Log.d(NAME, "Loading librnllama_v8.so");
+ System.loadLibrary("rnllama_v8");
  }
  } else if (LlamaContext.isX86_64()) {
- Log.d(NAME, "Loading librnllama.so");
+ Log.d(NAME, "Loading librnllama_x86_64.so");
+ System.loadLibrary("rnllama_x86_64");
+ } else {
+ Log.d(NAME, "Loading default librnllama.so");
  System.loadLibrary("rnllama");
  }
  }
@@ -308,20 +316,23 @@ public class LlamaContext {
  return Build.SUPPORTED_ABIS[0].equals("x86_64");
  }

- private static String cpuInfo() {
+ private static String getCpuFeatures() {
  File file = new File("/proc/cpuinfo");
  StringBuilder stringBuilder = new StringBuilder();
  try {
  BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
  String line;
  while ((line = bufferedReader.readLine()) != null) {
+ if (line.startsWith("Features")) {
  stringBuilder.append(line);
+ break;
+ }
  }
  bufferedReader.close();
  return stringBuilder.toString();
  } catch (IOException e) {
  Log.w(NAME, "Couldn't read /proc/cpuinfo", e);
- return null;
+ return "";
  }
  }
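
For context on the two Java hunks above: on 64-bit ARM Android devices, /proc/cpuinfo carries a "Features" line listing CPU capability tokens, and the substring checks in the static initializer use that line to pick which native variant to load. Below is a minimal, self-contained sketch of that classification; the class name and the sample Features string are illustrative assumptions, not code shipped in the package.

// Hypothetical illustration only -- not part of cui-llama.rn.
class CpuFeatureSelectionSketch {
  public static void main(String[] args) {
    // A Features line as commonly reported by an ARMv8.2-class core with
    // half-precision (fphp) and dot-product (asimddp) support.
    String cpuFeatures = "Features\t: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp";

    boolean hasFp16 = cpuFeatures.contains("fp16") || cpuFeatures.contains("fphp");          // true (fphp)
    boolean hasDotProd = cpuFeatures.contains("dotprod") || cpuFeatures.contains("asimddp"); // true (asimddp)
    boolean isAtLeastArmV82 = cpuFeatures.contains("asimd") && cpuFeatures.contains("crc32") && cpuFeatures.contains("aes"); // true
    boolean isAtLeastArmV84 = cpuFeatures.contains("dcpop") && cpuFeatures.contains("uscat"); // false: no "uscat" token
    boolean hasInt8Matmul = cpuFeatures.contains("i8mm");                                     // false

    // Mirrors the if/else ladder in the diff: this example would fall through
    // to the rnllama_v8_2_fp16_dotprod variant.
    String lib = (isAtLeastArmV84 && hasFp16 && hasDotProd && hasInt8Matmul) ? "rnllama_v8_4_fp16_dotprod_i8mm"
               : (isAtLeastArmV84 && hasFp16 && hasDotProd) ? "rnllama_v8_4_fp16_dotprod"
               : (isAtLeastArmV82 && hasFp16 && hasDotProd) ? "rnllama_v8_2_fp16_dotprod"
               : (isAtLeastArmV82 && hasFp16) ? "rnllama_v8_2_fp16"
               : "rnllama_v8";
    System.out.println("Would load lib" + lib + ".so");
  }
}
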
package/cpp/common.cpp CHANGED
@@ -691,7 +691,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  if (arg == "--lora") {
  CHECK_ARG
  params.lora_adapter.emplace_back(argv[i], 1.0f);
- params.use_mmap = false;
  return true;
  }
  if (arg == "--lora-scaled") {
@@ -699,7 +698,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
  const char* lora_adapter = argv[i];
  CHECK_ARG
  params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
- params.use_mmap = false;
  return true;
  }
  if (arg == "--lora-base") {
@@ -2095,19 +2093,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
  const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
  float lora_scale = std::get<1>(params.lora_adapter[i]);
- int err = llama_model_apply_lora_from_file(model,
- lora_adapter.c_str(),
- lora_scale,
- ((i > 0) || params.lora_base.empty())
- ? NULL
- : params.lora_base.c_str(),
- params.n_threads);
- if (err != 0) {
+ auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+ if (adapter == nullptr) {
  fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
  llama_free(lctx);
  llama_free_model(model);
  return std::make_tuple(nullptr, nullptr);
  }
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
  }

  if (params.ignore_eos) {
@@ -2146,8 +2139,8 @@

  struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
  auto mparams = llama_model_default_params();
+ mparams.vocab_only = params.vocab_only;

- mparams.vocab_only = params.vocab_only;
  if (params.n_gpu_layers != -1) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
package/cpp/common.h CHANGED
@@ -72,7 +72,7 @@ enum dimre_method {
  struct gpt_params {
  uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

- bool vocab_only = false;
+ bool vocab_only = false;
  int32_t n_threads = cpu_get_num_math();
  int32_t n_threads_draft = -1;
  int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)