cui-llama.rn 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/android/src/main/CMakeLists.txt +22 -19
- package/android/src/main/java/com/rnllama/LlamaContext.java +62 -20
- package/cpp/common.cpp +4 -11
- package/cpp/common.h +1 -1
- package/cpp/ggml-aarch64.c +2193 -2193
- package/cpp/ggml-aarch64.h +39 -39
- package/cpp/ggml-alloc.c +1042 -1041
- package/cpp/ggml-backend-impl.h +153 -153
- package/cpp/ggml-backend.c +2234 -2225
- package/cpp/ggml-backend.h +238 -236
- package/cpp/ggml-common.h +1829 -1829
- package/cpp/ggml-impl.h +655 -655
- package/cpp/ggml-metal.h +65 -65
- package/cpp/ggml-metal.m +3269 -3273
- package/cpp/ggml-quants.c +14860 -15022
- package/cpp/ggml-quants.h +132 -132
- package/cpp/ggml.c +16 -6
- package/cpp/ggml.h +2447 -2444
- package/cpp/llama.cpp +634 -531
- package/cpp/llama.h +30 -14
- package/cpp/log.h +737 -737
- package/cpp/rn-llama.hpp +9 -1
- package/cpp/sampling.cpp +460 -460
- package/cpp/sgemm.cpp +1027 -1027
- package/cpp/sgemm.h +14 -14
- package/package.json +1 -1
package/README.md
CHANGED
@@ -1,6 +1,15 @@
 # cui-llama.rn
 
-This is a fork of llama.rn meant for ChatterUI
+This is a fork of [llama.rn](https://github.com/mybigday/llama.rn) meant for [ChatterUI](https://github.com/Vali-98/ChatterUI)
+
+This fork exists to update llama.cpp on a more frequent basis, plus adding useful features to ChatterUI.
+
+The following features have been added for Android:
+
+- Updated sync for llama.cpp
+- Added stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
+- `vocab_only` mode: utilize the llama.cpp tokenizer
+- tokenizeSync: non-blocking, synchronous tokenizer function
 
 Original repo README.md below.
 
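For context, a rough sketch of how a ChatterUI-style consumer might use the two tokenizer-related additions from JavaScript. This is illustrative only: `vocab_only` and `tokenizeSync` are named in the README diff above, while the `initLlama` entry point and the shape of the returned result are assumed from the upstream llama.rn API rather than verified against this release.

```typescript
// Hypothetical usage sketch; initLlama and the result shape are assumed
// from upstream llama.rn, not confirmed against cui-llama.rn 1.0.3.
import { initLlama } from 'cui-llama.rn';

async function countTokens(modelPath: string, text: string): Promise<number> {
  // vocab_only: load only the vocabulary/tokenizer and skip the model weights,
  // which is much cheaper on a phone when only token counts are needed.
  const context = await initLlama({ model: modelPath, vocab_only: true });

  // tokenizeSync: synchronous tokenization (per the README), avoiding an
  // async round trip for frequent calls such as live token counting.
  const { tokens } = context.tokenizeSync(text);
  return tokens.length;
}
```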
package/android/src/main/CMakeLists.txt
CHANGED
@@ -23,13 +23,14 @@ set(
     ${RNLLAMA_LIB_DIR}/unicode.cpp
     ${RNLLAMA_LIB_DIR}/llama.cpp
     ${RNLLAMA_LIB_DIR}/sgemm.cpp
+    ${RNLLAMA_LIB_DIR}/ggml-aarch64.c
     ${RNLLAMA_LIB_DIR}/rn-llama.hpp
     ${CMAKE_SOURCE_DIR}/jni.cpp
 )
 
 find_library(LOG_LIB log)
 
-function(build_library target_name)
+function(build_library target_name cpu_flags)
     add_library(
         ${target_name}
         SHARED
@@ -38,32 +39,34 @@ function(build_library target_name)
 
     target_link_libraries(${target_name} ${LOG_LIB} android)
 
-    target_compile_options(${target_name} PRIVATE -pthread)
-
-    if (${target_name} STREQUAL "rnllama_v8fp16_va")
-        target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod)
-    endif ()
+    target_compile_options(${target_name} PRIVATE -pthread ${cpu_flags})
 
     if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
         target_compile_options(${target_name} PRIVATE -DRNLLAMA_ANDROID_ENABLE_LOGGING)
     endif ()
 
-    #
-
+    #if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+    target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
+    target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
+    target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
 
-
-
-
-
-    target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
-    target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
-    target_link_options(${target_name} PRIVATE -flto)
-
-    # endif ()
+    target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
+    target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
+    target_link_options(${target_name} PRIVATE -flto)
+    #endif ()
 endfunction()
 
-
+# Default target (no specific CPU features)
+build_library("rnllama" "")
 
 if (${ANDROID_ABI} STREQUAL "arm64-v8a")
-
+    # ARM64 targets
+    build_library("rnllama_v8_4_fp16_dotprod" "-march=armv8.4-a+fp16+dotprod")
+    build_library("rnllama_v8_2_fp16_dotprod" "-march=armv8.2-a+fp16+dotprod")
+    build_library("rnllama_v8_2_fp16" "-march=armv8.2-a+fp16")
+    build_library("rnllama_v8" "-march=armv8-a")
+elseif (${ANDROID_ABI} STREQUAL "x86_64")
+    # x86_64 target
+    build_library("rnllama_x86_64" "-march=x86-64" "-mtune=intel" "-msse4.2" "-mpopcnt")
+
 endif ()
package/android/src/main/java/com/rnllama/LlamaContext.java
CHANGED
@@ -17,6 +17,7 @@ import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.File;
 import java.io.IOException;
+import java.io.FileInputStream;
 
 public class LlamaContext {
   public static final String NAME = "RNLlamaContext";
@@ -28,6 +29,35 @@ public class LlamaContext {
   private int jobId = -1;
   private DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
 
+  private byte[] ggufHeader = {0x47, 0x47, 0x55, 0x46};
+
+  private boolean isGGUF(final String filepath) {
+    byte[] fileHeader = new byte[4];
+    FileInputStream fis = null;
+    try {
+      fis = new FileInputStream(filepath);
+      int bytesRead = fis.read(fileHeader);
+      if(bytesRead < 4) {
+        return false;
+      }
+      for(int i = 0; i < 4; i++){
+        if(fileHeader[i] != ggufHeader[i])
+          return false;
+      }
+      return true;
+    } catch (Exception e) {
+      return false;
+    } finally {
+      if (fis != null) {
+        try {
+          fis.close();
+        } catch (Exception e) {
+          Log.d(NAME, "Closing FileInputStream failed.");
+        }
+      }
+    }
+  }
+
   public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap params) {
     if (LlamaContext.isArm64V8a() == false && LlamaContext.isX86_64() == false) {
       throw new IllegalStateException("Only 64-bit architectures are supported");
@@ -35,6 +65,11 @@ public class LlamaContext {
     if (!params.hasKey("model")) {
       throw new IllegalArgumentException("Missing required parameter: model");
     }
+    // Check if file has GGUF magic numbers
+    if(!isGGUF(params.getString("model"))) {
+      throw new IllegalArgumentException("File is not in GGUF format");
+    }
+
     this.id = id;
     this.context = initContext(
       // String model,
@@ -239,28 +274,32 @@ public class LlamaContext {
   static {
     Log.d(NAME, "Primary ABI: " + Build.SUPPORTED_ABIS[0]);
     if (LlamaContext.isArm64V8a()) {
-
-
-      // ARMv8.2a needs runtime detection support
-      String cpuInfo = LlamaContext.cpuInfo();
-      if (cpuInfo != null) {
-        Log.d(NAME, "CPU info: " + cpuInfo);
-        if (cpuInfo.contains("fphp")) {
-          Log.d(NAME, "CPU supports fp16 arithmetic");
-          loadV8fp16 = true;
-        }
-      }
-      }
+      String cpuFeatures = LlamaContext.getCpuFeatures();
+      Log.d(NAME, "CPU features: " + cpuFeatures);
 
-
-
-
+      boolean hasFp16 = cpuFeatures.contains("fp16") || cpuFeatures.contains("fphp");
+      boolean hasDotProd = cpuFeatures.contains("dotprod") || cpuFeatures.contains("asimddp");
+      boolean isAtLeastArmV82 = cpuFeatures.contains("asimd") && cpuFeatures.contains("crc32") && cpuFeatures.contains("aes");
+      boolean isAtLeastArmV84 = cpuFeatures.contains("dcpop") && cpuFeatures.contains("uscat");
+
+      if (isAtLeastArmV84 && hasFp16 && hasDotProd) {
+        Log.d(NAME, "Loading librnllama_v8_4_fp16_dotprod.so");
+        System.loadLibrary("rnllama_v8_4_fp16_dotprod");
+      } else if (isAtLeastArmV82 && hasFp16 && hasDotProd) {
+        Log.d(NAME, "Loading librnllama_v8_2_fp16_dotprod.so");
+        System.loadLibrary("rnllama_v8_2_fp16_dotprod");
+      } else if (isAtLeastArmV82 && hasFp16) {
+        Log.d(NAME, "Loading librnllama_v8_2_fp16.so");
+        System.loadLibrary("rnllama_v8_2_fp16");
       } else {
-        Log.d(NAME, "Loading
-        System.loadLibrary("
+        Log.d(NAME, "Loading librnllama_v8.so");
+        System.loadLibrary("rnllama_v8");
       }
     } else if (LlamaContext.isX86_64()) {
-      Log.d(NAME, "Loading
+      Log.d(NAME, "Loading librnllama_x86_64.so");
+      System.loadLibrary("rnllama_x86_64");
+    } else {
+      Log.d(NAME, "Loading default librnllama.so");
       System.loadLibrary("rnllama");
     }
   }
@@ -273,20 +312,23 @@ public class LlamaContext {
     return Build.SUPPORTED_ABIS[0].equals("x86_64");
   }
 
-  private static String
+  private static String getCpuFeatures() {
    File file = new File("/proc/cpuinfo");
    StringBuilder stringBuilder = new StringBuilder();
    try {
      BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
      String line;
      while ((line = bufferedReader.readLine()) != null) {
+        if (line.startsWith("Features")) {
          stringBuilder.append(line);
+          break;
+        }
      }
      bufferedReader.close();
      return stringBuilder.toString();
    } catch (IOException e) {
      Log.w(NAME, "Couldn't read /proc/cpuinfo", e);
-      return
+      return "";
    }
  }
 
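Because context initialization now rejects any model file whose first four bytes are not the GGUF magic (0x47 0x47 0x55 0x46, i.e. "GGUF"), a JavaScript caller can surface this as a friendly error instead of a later crash inside llama.cpp. A minimal sketch, again assuming an llama.rn-style `initLlama` entry point and that the Java `IllegalArgumentException` message propagates into the rejected promise:

```typescript
// Sketch only; the error-message match assumes the IllegalArgumentException
// text from LlamaContext.java above reaches the JS promise rejection as-is.
import { initLlama } from 'cui-llama.rn';

async function initModelSafely(modelPath: string) {
  try {
    return await initLlama({ model: modelPath });
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    if (message.includes('File is not in GGUF format')) {
      // The file exists but is not GGUF, e.g. a legacy GGML model
      // or a partially downloaded file.
      console.warn('Selected model is not a GGUF file:', modelPath);
      return null;
    }
    throw e;
  }
}
```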
package/cpp/common.cpp
CHANGED
@@ -691,7 +691,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--lora") {
         CHECK_ARG
         params.lora_adapter.emplace_back(argv[i], 1.0f);
-        params.use_mmap = false;
         return true;
     }
     if (arg == "--lora-scaled") {
@@ -699,7 +698,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         const char* lora_adapter = argv[i];
         CHECK_ARG
         params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-        params.use_mmap = false;
         return true;
     }
     if (arg == "--lora-base") {
@@ -2095,19 +2093,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-
-
-            lora_scale,
-            ((i > 0) || params.lora_base.empty())
-                ? NULL
-                : params.lora_base.c_str(),
-            params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
+        llama_lora_adapter_set(lctx, adapter, lora_scale);
     }
 
     if (params.ignore_eos) {
@@ -2146,8 +2139,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
+    mparams.vocab_only = params.vocab_only;
 
-    mparams.vocab_only = params.vocab_only;
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
package/cpp/common.h
CHANGED
@@ -72,7 +72,7 @@ enum dimre_method {
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-
+    bool vocab_only = false;
     int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|