cui-llama.rn 1.4.3 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +93 -114
- package/android/src/main/CMakeLists.txt +5 -0
- package/android/src/main/build-arm64/CMakeCache.txt +429 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +21 -21
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
- package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
- package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +376 -0
- package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
- package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
- package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
- package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
- package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
- package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
- package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
- package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
- package/android/src/main/build-arm64/Makefile +1862 -0
- package/android/src/main/build-arm64/cmake_install.cmake +66 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +91 -17
- package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
- package/android/src/main/jni-utils.h +6 -0
- package/android/src/main/jni.cpp +287 -31
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
- package/cpp/chat-template.hpp +529 -0
- package/cpp/chat.cpp +1085 -0
- package/cpp/chat.hpp +55 -0
- package/cpp/common.cpp +159 -36
- package/cpp/common.h +64 -19
- package/cpp/ggml-alloc.c +1 -13
- package/cpp/ggml-common.h +0 -2
- package/cpp/ggml-cpu-impl.h +6 -12
- package/cpp/ggml-cpu-quants.c +937 -340
- package/cpp/ggml-cpu.c +207 -113
- package/cpp/ggml-cpu.cpp +4 -6
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-metal.h +66 -66
- package/cpp/ggml-metal.m +141 -23
- package/cpp/ggml.c +24 -14
- package/cpp/ggml.h +2 -2
- package/cpp/json-schema-to-grammar.cpp +46 -66
- package/cpp/json-schema-to-grammar.h +15 -1
- package/cpp/llama-arch.cpp +7 -2
- package/cpp/llama-arch.h +3 -1
- package/cpp/llama-chat.cpp +10 -1
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-grammar.cpp +86 -6
- package/cpp/llama-grammar.h +22 -1
- package/cpp/llama-impl.h +6 -6
- package/cpp/llama-kv-cache.h +1 -1
- package/cpp/llama-mmap.h +1 -0
- package/cpp/llama-model-loader.cpp +1 -1
- package/cpp/llama-model.cpp +32 -6
- package/cpp/llama-sampling.cpp +178 -61
- package/cpp/llama-vocab.cpp +8 -3
- package/cpp/llama.cpp +188 -128
- package/cpp/llama.h +27 -10
- package/cpp/log.cpp +32 -10
- package/cpp/log.h +12 -1
- package/cpp/minja.hpp +2883 -0
- package/cpp/rn-llama.cpp +82 -5
- package/cpp/rn-llama.h +16 -1
- package/cpp/sampling.cpp +68 -41
- package/cpp/sampling.h +3 -0
- package/cpp/sgemm.cpp +9 -8
- package/cpp/unicode.cpp +9 -2
- package/ios/CMakeLists.txt +6 -0
- package/ios/RNLlama.h +0 -8
- package/ios/RNLlama.mm +27 -3
- package/ios/RNLlamaContext.h +10 -1
- package/ios/RNLlamaContext.mm +269 -57
- package/jest/mock.js +21 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +3 -0
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +87 -13
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +3 -0
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +86 -13
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +107 -2
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +32 -7
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +1 -1
- package/package.json +3 -2
- package/src/NativeRNLlama.ts +115 -3
- package/src/grammar.ts +3 -0
- package/src/index.ts +138 -21
package/cpp/llama.cpp
CHANGED
@@ -4621,7 +4621,8 @@ struct llm_build_context {
  lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

-
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing lm_ggml_cont
+ kv_compressed = lm_ggml_cont(ctx0, kv_compressed);
  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -6475,7 +6476,8 @@ struct llm_build_context {
  lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

-
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing lm_ggml_cont
+ kv_compressed = lm_ggml_cont(ctx0, kv_compressed);
  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -7226,17 +7228,30 @@ struct llm_build_context {
  struct lm_ggml_tensor * Qcur = nullptr;
  struct lm_ggml_tensor * Kcur = nullptr;
  struct lm_ggml_tensor * Vcur = nullptr;
-
-
-
-
-
-
-
-
-
-
-
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
@@ -7711,17 +7726,13 @@ struct llm_build_context {
  1
  );

+ struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
  lm_ggml_build_forward_expand(
  gf,
  lm_ggml_cpy(
  ctx0,
-
- lm_ggml_view_1d(
- ctx0,
- kv_self.v_l[il],
- hparams.n_embd_v_s() * n_seqs,
- hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
- )
+ lm_ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0),
+ lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
  )
  );

@@ -8443,74 +8454,33 @@ static enum lm_ggml_status llama_graph_compute(
  return status;
  }

-
-
-
-
- //
- // - lctx: llama context
- // - batch: batch to evaluate
- //
- // return 0 on success
- // return positive int on warning
- // return negative int on error
- //
- static int llama_decode_impl(
- llama_context & lctx,
- llama_batch inp_batch) {
-
- lctx.is_encoding = false;
-
- if (inp_batch.n_tokens == 0) {
- LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
- return -1;
- }
-
- // temporary allocate memory for the input batch if needed
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
-
- const llama_batch & batch = batch_allocr.batch;
- const uint32_t n_tokens_all = batch.n_tokens;
-
+ static int llama_prepare_sbatch(
+ llama_context & lctx,
+ const llama_batch & batch,
+ uint32_t & n_outputs) {
  const auto & model = lctx.model;
- const auto & vocab = model.vocab;
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;

-
+ const uint32_t n_tokens_all = batch.n_tokens;
+ const int64_t n_embd = hparams.n_embd;

+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+ LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  if (batch.token) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch.token[i] < 0 || (
+ if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
  LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
  }
  }
-
  LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
-
  LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

- if (lctx.t_compute_start_us == 0) {
- lctx.t_compute_start_us = lm_ggml_time_us();
- }
  lctx.n_queued_tokens += n_tokens_all;
-
- auto & kv_self = lctx.kv_self;
- llama_kv_slot_restorer kv_slot_restorer(kv_self);
-
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
-
- uint32_t n_outputs = 0;
- uint32_t n_outputs_prev = 0;
-
- const auto n_ubatch = cparams.n_ubatch;
-
- // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
- const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
  lctx.embd_seq.clear();

  // count outputs
@@ -8526,7 +8496,7 @@ static int llama_decode_impl(
  }

  lctx.sbatch.from_batch(batch, n_embd,
- /* simple_split */ !kv_self.recurrent,
+ /* simple_split */ !lctx.kv_self.recurrent,
  /* logits_all */ n_outputs == n_tokens_all);

  // reserve output buffer
@@ -8535,70 +8505,148 @@ static int llama_decode_impl(
  return -2;
  };

-
-
-
-
-
-
-
-
-
-
-
+ return 0;
+ }
+
+ static int llama_prepare_ubatch(
+ llama_context & lctx,
+ llama_kv_slot_restorer & kv_slot_restorer,
+ llama_ubatch & ubatch,
+ const uint32_t n_outputs,
+ const uint32_t n_tokens_all) {
+ LM_GGML_ASSERT(lctx.sbatch.n_tokens > 0);
+
+ auto & kv_self = lctx.kv_self;
+ const auto & cparams = lctx.cparams;
+ const auto & hparams = lctx.model.hparams;
+
+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+ if (lctx.kv_self.recurrent) {
+ if (embd_pooled) {
+ // Pooled embeddings cannot be split across ubatches (yet)
+ ubatch = lctx.sbatch.split_seq(cparams.n_ubatch);
  } else {
-
+ // recurrent model architectures are easier to implement
+ // with equal-length sequences
+ ubatch = lctx.sbatch.split_equal(cparams.n_ubatch);
  }
-
+ } else {
+ ubatch = lctx.sbatch.split_simple(cparams.n_ubatch);
+ }

-
-
-
+ // count the outputs in this u_batch
+ {
+ int32_t n_outputs_new = 0;

-
-
-
-
-
-
- }
+ if (n_outputs == n_tokens_all) {
+ n_outputs_new = ubatch.n_tokens;
+ } else {
+ LM_GGML_ASSERT(ubatch.output);
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ n_outputs_new += int32_t(ubatch.output[i] != 0);
  }
+ }
+
+ // needs to happen before the graph is built
+ lctx.n_outputs = n_outputs_new;
+ }

-
-
+ // non-causal masks do not use the KV cache
+ if (hparams.causal_attn) {
+ llama_kv_cache_update(&lctx);
+
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) {
+ kv_self.head = 0;
  }

-
-
+ const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ return 1;
+ }
+ kv_slot_restorer.save(slot);
+
+ if (!kv_self.recurrent) {
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
+ // after enough generations, the benefit from this heuristic disappears
+ // if we start defragmenting the cache, the benefit from this will be more important
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
+ kv_self.n = std::min(kv_self.size, std::max(pad, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
+ }
+ }

-
+ return 0;
+ }

-
-
-
+ // decode a batch of tokens by evaluating the transformer
+ // in case of unsuccessful decoding (error or warning),
+ // the kv_cache state will be returned to its original state
+ // (for non-recurrent models) or cleaned (for recurrent models)
+ //
+ // - lctx: llama context
+ // - inp_batch: batch to evaluate
+ //
+ // return 0 on success
+ // return positive int on warning
+ // return negative int on error
+ //
+ static int llama_decode_impl(
+ llama_context & lctx,
+ llama_batch inp_batch) {

-
- // better to start searching from the beginning of the cache, hoping to fill it
- if (kv_self.head > kv_self.used + 2*n_tokens) {
- kv_self.head = 0;
- }
+ lctx.is_encoding = false;

-
-
-
-
-
+ if (inp_batch.n_tokens == 0) {
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+ return -1;
+ }
+
+ // temporarily allocate memory for the input batch if needed
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
+ const llama_batch & batch = batch_allocr.batch;
+
+ const auto & model = lctx.model;
+ const auto & vocab = model.vocab;
+ const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;

-
-
-
-
-
-
-
+ if (lctx.t_compute_start_us == 0) {
+ lctx.t_compute_start_us = lm_ggml_time_us();
+ }
+ auto & kv_self = lctx.kv_self;
+ llama_kv_slot_restorer kv_slot_restorer(kv_self);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_vocab = vocab.n_tokens();
+
+ uint32_t n_outputs = 0;
+ uint32_t n_outputs_prev = 0;
+
+ {
+ const int ret = llama_prepare_sbatch(lctx, batch, n_outputs);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+
+ while (lctx.sbatch.n_tokens > 0) {
+ llama_ubatch ubatch;
+ {
+ const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens);
+ if (ret != 0) {
+ return ret;
  }
  }

+ const int n_threads = ubatch.n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ lm_ggml_threadpool_t threadpool = ubatch.n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
+ LM_GGML_ASSERT(n_threads > 0);
+
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

  lm_ggml_backend_sched_reset(lctx.sched.get());
@@ -8651,7 +8699,7 @@ static int llama_decode_impl(

  // update the kv ring buffer
  {
- kv_self.head += n_tokens;
+ kv_self.head += ubatch.n_tokens;

  // Ensure kv cache head points to a valid index.
  if (kv_self.head >= kv_self.size) {
@@ -8764,12 +8812,14 @@ static int llama_decode_impl(
  //llama_synchronize(&lctx);

  // decide if we need to defrag the kv cache
- if (cparams.causal_attn && cparams.defrag_thold
-
+ if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+ // - do not defrag small contexts (i.e. < 2048 tokens)
+ // - count the padding towards the number of used tokens
+ const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;

  // queue defragmentation for next llama_kv_cache_update
  if (fragmentation > cparams.defrag_thold) {
-
+ LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);

  llama_kv_cache_defrag(kv_self);
  }
@@ -9391,8 +9441,6 @@ static struct llama_model * llama_model_load_from_file_impl(
  struct llama_model_params params) {
  lm_ggml_time_init();

- llama_model * model = new llama_model(params);
-
  unsigned cur_percentage = 0;
  if (params.progress_callback == NULL) {
  params.progress_callback_user_data = &cur_percentage;
@@ -9410,12 +9458,15 @@ static struct llama_model * llama_model_load_from_file_impl(
  };
  }

+ llama_model * model = new llama_model(params);
+
  // create list of devices to use with this model
  if (params.devices) {
  for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
  model->devices.push_back(*dev);
  }
  } else {
+ std::vector<lm_ggml_backend_dev_t> rpc_servers;
  // use all available devices
  for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
  lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
@@ -9426,10 +9477,19 @@ static struct llama_model * llama_model_load_from_file_impl(
  break;

  case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
-
+ lm_ggml_backend_reg_t reg = lm_ggml_backend_dev_backend_reg(dev);
+ if (lm_ggml_backend_reg_name(reg) == std::string("RPC")) {
+ rpc_servers.push_back(dev);
+ } else {
+ model->devices.push_back(dev);
+ }
  break;
  }
  }
+ // add RPC servers at the front of the list
+ if (!rpc_servers.empty()) {
+ model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+ }
  }

  // if using single GPU mode, remove all except the main GPU
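The defragmentation change in the @@ -8764,12 +8812,14 @@ hunk above reduces to a small threshold check. A minimal standalone sketch of that heuristic, assuming plain parameters in place of the library's internal kv_self/cparams state (the function and parameter names here are illustrative, not part of the package):

#include <algorithm>
#include <cstdint>

// Sketch of the defrag trigger shown above: only defragment for causal attention
// with a positive threshold, skip small contexts (< 2048 cells), and count the
// KV-cache padding towards the used cells. kv_n, kv_used, padding and defrag_thold
// stand in for kv_self.n, kv_self.used, llama_kv_cache_get_padding(cparams) and
// cparams.defrag_thold.
static bool should_defrag(uint32_t kv_n, uint32_t kv_used, uint32_t padding,
                          float defrag_thold, bool causal_attn) {
    if (!causal_attn || defrag_thold <= 0.0f) {
        return false;
    }
    const float fragmentation = kv_n >= 2048
        ? std::max(0.0f, 1.0f - float(kv_used + padding) / float(kv_n))
        : 0.0f;
    return fragmentation > defrag_thold;
}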
package/cpp/llama.h
CHANGED
@@ -214,7 +214,7 @@ extern "C" {
  LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
  };

- // TODO: simplify (https://github.com/
+ // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
  typedef struct llama_token_data {
  llama_token id; // token id
  float logit; // log-odds of the token
@@ -308,7 +308,7 @@ extern "C" {
  };

  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
- // https://github.com/
+ // https://github.com/ggml-org/llama.cpp/pull/7544
  struct llama_context_params {
  uint32_t n_ctx; // text context, 0 = from model
  uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -321,7 +321,7 @@ extern "C" {
  enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
  enum llama_attention_type attention_type; // attention type to use for embeddings

- // ref: https://github.com/
+ // ref: https://github.com/ggml-org/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
  float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -386,7 +386,7 @@ extern "C" {
  struct llama_adapter_lora;

  // Helpers for getting default parameters
- // TODO: update API to start accepting pointers to params structs (https://github.com/
+ // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
  LLAMA_API struct llama_model_params llama_model_default_params(void);
  LLAMA_API struct llama_context_params llama_context_default_params(void);
  LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
@@ -511,7 +511,8 @@ extern "C" {
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);

  // Get the default chat template. Returns nullptr if not available
-
+ // If name is NULL, returns the default chat template
+ LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);

  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
@@ -1040,7 +1041,7 @@ extern "C" {

  /// Apply chat template. Inspired by hf apply_chat_template() on python.
  /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
- /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/
+ /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
  /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
  /// @param chat Pointer to a list of multiple llama_chat_message
  /// @param n_msg Number of llama_chat_message in this chat
@@ -1114,11 +1115,12 @@ extern "C" {
  };

  struct llama_sampler {
- struct llama_sampler_i
- llama_sampler_context_t
+ const struct llama_sampler_i * iface;
+ llama_sampler_context_t ctx;
  };

  // mirror of llama_sampler_i:
+ LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
  LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
  LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
  LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1148,7 +1150,7 @@ extern "C" {
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
  DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
- "will be removed in the future (see https://github.com/
+ "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1156,7 +1158,7 @@ extern "C" {
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
  LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);

- /// @details Minimum P sampling as described in https://github.com/
+ /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
  LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);

  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -1171,6 +1173,9 @@ extern "C" {
  /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
  LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);

+ /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+ LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1199,6 +1204,18 @@ extern "C" {
  const char * grammar_str,
  const char * grammar_root);

+ /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+ /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
+ /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
+ LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens);
+
  /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
  int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
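The new sampler declarations above (llama_sampler_init, llama_sampler_init_top_n_sigma, llama_sampler_init_grammar_lazy) plug into the existing sampler-chain API. A minimal usage sketch, assuming a vocab obtained from an already loaded model; the grammar string and trigger word below are placeholders, not values shipped with the package:

#include "llama.h"

// Hedged sketch: build a sampler chain using the samplers added in this release.
// `vocab` is assumed to come from llama_model_get_vocab() on a model loaded elsewhere.
static struct llama_sampler * build_chain(const struct llama_vocab * vocab) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // top-n-sigma sampling (new in this version)
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.0f));

    // lazy grammar (new in this version): constrained decoding is only enforced
    // once one of the trigger words appears in the output
    const char * trigger_words[] = { "<tool_call>" };      // placeholder trigger
    const char * grammar = "root ::= \"{\" [^}]* \"}\"";   // placeholder GBNF grammar
    llama_sampler_chain_add(chain, llama_sampler_init_grammar_lazy(
        vocab, grammar, "root",
        trigger_words, 1,
        /*trigger_tokens =*/ nullptr, 0));

    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return chain; // release later with llama_sampler_free(chain)
}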
package/cpp/log.cpp
CHANGED
@@ -1,5 +1,6 @@
  #include "log.h"

+ #include <chrono>
  #include <condition_variable>
  #include <cstdarg>
  #include <cstdio>
@@ -8,22 +9,16 @@
  #include <thread>
  #include <vector>

+ #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
+ #include <android/log.h>
+ #endif
+
  int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;

  void common_log_set_verbosity_thold(int verbosity) {
  common_log_verbosity_thold = verbosity;
  }

- #define LOG_COL_DEFAULT "\033[0m"
- #define LOG_COL_BOLD "\033[1m"
- #define LOG_COL_RED "\033[31m"
- #define LOG_COL_GREEN "\033[32m"
- #define LOG_COL_YELLOW "\033[33m"
- #define LOG_COL_BLUE "\033[34m"
- #define LOG_COL_MAGENTA "\033[35m"
- #define LOG_COL_CYAN "\033[36m"
- #define LOG_COL_WHITE "\033[37m"
-
  static int64_t t_us() {
  return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
  }
@@ -66,6 +61,32 @@ struct common_log_entry {
  // signals the worker thread to stop
  bool is_end;

+ #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
+ void android_print() const {
+ int android_log_priority;
+ switch (level) {
+ case LM_GGML_LOG_LEVEL_INFO:
+ android_log_priority = ANDROID_LOG_INFO;
+ break;
+ case LM_GGML_LOG_LEVEL_WARN:
+ android_log_priority = ANDROID_LOG_WARN;
+ break;
+ case LM_GGML_LOG_LEVEL_ERROR:
+ android_log_priority = ANDROID_LOG_ERROR;
+ break;
+ case LM_GGML_LOG_LEVEL_DEBUG:
+ android_log_priority = ANDROID_LOG_DEBUG;
+ break;
+ default:
+ android_log_priority = ANDROID_LOG_DEFAULT;
+ break;
+ }
+
+ const char * tag = "RNLLAMA_LOG_ANDROID";
+ __android_log_print(android_log_priority, tag, "%s", msg.data());
+ }
+ #endif
+
  void print(FILE * file = nullptr) const {
  FILE * fcur = file;
  if (!fcur) {
@@ -206,6 +227,7 @@ public:
  vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
  }
  #endif
+ va_end(args_copy);
  }

  entry.level = level;
package/cpp/log.h
CHANGED
@@ -2,9 +2,20 @@

  #include "ggml.h" // for lm_ggml_log_level

+ #define LOG_CLR_TO_EOL "\033[K\r"
+ #define LOG_COL_DEFAULT "\033[0m"
+ #define LOG_COL_BOLD "\033[1m"
+ #define LOG_COL_RED "\033[31m"
+ #define LOG_COL_GREEN "\033[32m"
+ #define LOG_COL_YELLOW "\033[33m"
+ #define LOG_COL_BLUE "\033[34m"
+ #define LOG_COL_MAGENTA "\033[35m"
+ #define LOG_COL_CYAN "\033[36m"
+ #define LOG_COL_WHITE "\033[37m"
+
  #ifndef __GNUC__
  # define LOG_ATTRIBUTE_FORMAT(...)
- #elif defined(__MINGW32__)
+ #elif defined(__MINGW32__) && !defined(__clang__)
  # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
  #else
  # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))