cui-llama.rn 1.4.3 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/build-arm64/CMakeCache.txt +429 -0
  4. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +21 -21
  5. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
  6. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  7. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  8. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +376 -0
  9. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  10. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
  11. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
  12. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
  13. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
  14. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
  15. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
  16. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
  17. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
  18. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
  19. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
  20. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
  21. package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
  22. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  23. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
  24. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  25. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
  26. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  27. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
  28. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  29. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
  30. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  31. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
  32. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  33. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
  34. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  35. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
  36. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  37. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
  38. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  39. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
  40. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  41. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
  42. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  43. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
  44. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  45. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
  46. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  47. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
  48. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  49. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
  50. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
  51. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
  52. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
  53. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
  54. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
  55. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
  56. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
  57. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
  58. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
  59. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
  60. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
  61. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
  62. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
  63. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
  64. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
  65. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
  66. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
  67. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
  68. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
  69. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
  70. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
  71. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
  72. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
  73. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
  74. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
  75. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
  76. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
  77. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
  78. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
  79. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
  80. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
  81. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
  82. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
  83. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
  84. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
  85. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
  86. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
  87. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
  88. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
  89. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
  90. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
  91. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
  92. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
  93. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
  94. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
  95. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
  96. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
  97. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
  98. package/android/src/main/build-arm64/Makefile +1862 -0
  99. package/android/src/main/build-arm64/cmake_install.cmake +66 -0
  100. package/android/src/main/java/com/rnllama/LlamaContext.java +91 -17
  101. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  102. package/android/src/main/jni-utils.h +6 -0
  103. package/android/src/main/jni.cpp +287 -31
  104. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  105. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  106. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  107. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  108. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  109. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  110. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  111. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  112. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  113. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  114. package/cpp/chat-template.hpp +529 -0
  115. package/cpp/chat.cpp +1085 -0
  116. package/cpp/chat.hpp +55 -0
  117. package/cpp/common.cpp +159 -36
  118. package/cpp/common.h +64 -19
  119. package/cpp/ggml-alloc.c +1 -13
  120. package/cpp/ggml-common.h +0 -2
  121. package/cpp/ggml-cpu-impl.h +6 -12
  122. package/cpp/ggml-cpu-quants.c +937 -340
  123. package/cpp/ggml-cpu.c +207 -113
  124. package/cpp/ggml-cpu.cpp +4 -6
  125. package/cpp/ggml-cpu.h +1 -1
  126. package/cpp/ggml-metal.h +66 -66
  127. package/cpp/ggml-metal.m +141 -23
  128. package/cpp/ggml.c +24 -14
  129. package/cpp/ggml.h +2 -2
  130. package/cpp/json-schema-to-grammar.cpp +46 -66
  131. package/cpp/json-schema-to-grammar.h +15 -1
  132. package/cpp/llama-arch.cpp +7 -2
  133. package/cpp/llama-arch.h +3 -1
  134. package/cpp/llama-chat.cpp +10 -1
  135. package/cpp/llama-chat.h +1 -0
  136. package/cpp/llama-grammar.cpp +86 -6
  137. package/cpp/llama-grammar.h +22 -1
  138. package/cpp/llama-impl.h +6 -6
  139. package/cpp/llama-kv-cache.h +1 -1
  140. package/cpp/llama-mmap.h +1 -0
  141. package/cpp/llama-model-loader.cpp +1 -1
  142. package/cpp/llama-model.cpp +32 -6
  143. package/cpp/llama-sampling.cpp +178 -61
  144. package/cpp/llama-vocab.cpp +8 -3
  145. package/cpp/llama.cpp +188 -128
  146. package/cpp/llama.h +27 -10
  147. package/cpp/log.cpp +32 -10
  148. package/cpp/log.h +12 -1
  149. package/cpp/minja.hpp +2883 -0
  150. package/cpp/rn-llama.cpp +82 -5
  151. package/cpp/rn-llama.h +16 -1
  152. package/cpp/sampling.cpp +68 -41
  153. package/cpp/sampling.h +3 -0
  154. package/cpp/sgemm.cpp +9 -8
  155. package/cpp/unicode.cpp +9 -2
  156. package/ios/CMakeLists.txt +6 -0
  157. package/ios/RNLlama.h +0 -8
  158. package/ios/RNLlama.mm +27 -3
  159. package/ios/RNLlamaContext.h +10 -1
  160. package/ios/RNLlamaContext.mm +269 -57
  161. package/jest/mock.js +21 -2
  162. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  163. package/lib/commonjs/grammar.js +3 -0
  164. package/lib/commonjs/grammar.js.map +1 -1
  165. package/lib/commonjs/index.js +87 -13
  166. package/lib/commonjs/index.js.map +1 -1
  167. package/lib/module/NativeRNLlama.js.map +1 -1
  168. package/lib/module/grammar.js +3 -0
  169. package/lib/module/grammar.js.map +1 -1
  170. package/lib/module/index.js +86 -13
  171. package/lib/module/index.js.map +1 -1
  172. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  173. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  174. package/lib/typescript/grammar.d.ts.map +1 -1
  175. package/lib/typescript/index.d.ts +32 -7
  176. package/lib/typescript/index.d.ts.map +1 -1
  177. package/llama-rn.podspec +1 -1
  178. package/package.json +3 -2
  179. package/src/NativeRNLlama.ts +115 -3
  180. package/src/grammar.ts +3 -0
  181. package/src/index.ts +138 -21
package/cpp/llama.cpp CHANGED
@@ -4621,7 +4621,8 @@ struct llm_build_context {
  lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

- kv_compressed = lm_ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing lm_ggml_cont
+ kv_compressed = lm_ggml_cont(ctx0, kv_compressed);
  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -6475,7 +6476,8 @@ struct llm_build_context {
  lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

- kv_compressed = lm_ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing lm_ggml_cont
+ kv_compressed = lm_ggml_cont(ctx0, kv_compressed);
  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, cb, il);
@@ -7226,17 +7228,30 @@ struct llm_build_context {
  struct lm_ggml_tensor * Qcur = nullptr;
  struct lm_ggml_tensor * Kcur = nullptr;
  struct lm_ggml_tensor * Vcur = nullptr;
-
- cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
- cb(cur, "wqkv", il);
-
- cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
- cb(cur, "bqkv", il);
-
- Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
@@ -7711,17 +7726,13 @@ struct llm_build_context {
  1
  );

+ struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
  lm_ggml_build_forward_expand(
  gf,
  lm_ggml_cpy(
  ctx0,
- wkv_states,
- lm_ggml_view_1d(
- ctx0,
- kv_self.v_l[il],
- hparams.n_embd_v_s() * n_seqs,
- hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
- )
+ lm_ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0),
+ lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
  )
  );

@@ -8443,74 +8454,33 @@ static enum lm_ggml_status llama_graph_compute(
  return status;
  }

- // decode a batch of tokens by evaluating the transformer
- // in case of unsuccessful decoding (error or warning),
- // the kv_cache state will be returned to its original state
- // (for non-recurrent models) or cleaned (for recurrent models)
- //
- // - lctx: llama context
- // - batch: batch to evaluate
- //
- // return 0 on success
- // return positive int on warning
- // return negative int on error
- //
- static int llama_decode_impl(
- llama_context & lctx,
- llama_batch inp_batch) {
-
- lctx.is_encoding = false;
-
- if (inp_batch.n_tokens == 0) {
- LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
- return -1;
- }
-
- // temporary allocate memory for the input batch if needed
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
-
- const llama_batch & batch = batch_allocr.batch;
- const uint32_t n_tokens_all = batch.n_tokens;
-
+ static int llama_prepare_sbatch(
+ llama_context & lctx,
+ const llama_batch & batch,
+ uint32_t & n_outputs) {
  const auto & model = lctx.model;
- const auto & vocab = model.vocab;
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;

- LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+ const uint32_t n_tokens_all = batch.n_tokens;
+ const int64_t n_embd = hparams.n_embd;

+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+ LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  if (batch.token) {
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+ if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
  LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  return -1;
  }
  }
  }
-
  LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
-
  LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

- if (lctx.t_compute_start_us == 0) {
- lctx.t_compute_start_us = lm_ggml_time_us();
- }
  lctx.n_queued_tokens += n_tokens_all;
-
- auto & kv_self = lctx.kv_self;
- llama_kv_slot_restorer kv_slot_restorer(kv_self);
-
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
-
- uint32_t n_outputs = 0;
- uint32_t n_outputs_prev = 0;
-
- const auto n_ubatch = cparams.n_ubatch;
-
- // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
- const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
  lctx.embd_seq.clear();

  // count outputs
@@ -8526,7 +8496,7 @@ static int llama_decode_impl(
  }

  lctx.sbatch.from_batch(batch, n_embd,
- /* simple_split */ !kv_self.recurrent,
+ /* simple_split */ !lctx.kv_self.recurrent,
  /* logits_all */ n_outputs == n_tokens_all);

  // reserve output buffer
@@ -8535,70 +8505,148 @@ static int llama_decode_impl(
  return -2;
  };

- while (lctx.sbatch.n_tokens > 0) {
- llama_ubatch ubatch;
- if (kv_self.recurrent) {
- if (embd_pooled) {
- // Pooled embeddings cannot be split across ubatches (yet)
- ubatch = lctx.sbatch.split_seq(n_ubatch);
- } else {
- // recurrent model architectures are easier to implement
- // with equal-length sequences
- ubatch = lctx.sbatch.split_equal(n_ubatch);
- }
+ return 0;
+ }
+
+ static int llama_prepare_ubatch(
+ llama_context & lctx,
+ llama_kv_slot_restorer & kv_slot_restorer,
+ llama_ubatch & ubatch,
+ const uint32_t n_outputs,
+ const uint32_t n_tokens_all) {
+ LM_GGML_ASSERT(lctx.sbatch.n_tokens > 0);
+
+ auto & kv_self = lctx.kv_self;
+ const auto & cparams = lctx.cparams;
+ const auto & hparams = lctx.model.hparams;
+
+ // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+ const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+ if (lctx.kv_self.recurrent) {
+ if (embd_pooled) {
+ // Pooled embeddings cannot be split across ubatches (yet)
+ ubatch = lctx.sbatch.split_seq(cparams.n_ubatch);
  } else {
- ubatch = lctx.sbatch.split_simple(n_ubatch);
+ // recurrent model architectures are easier to implement
+ // with equal-length sequences
+ ubatch = lctx.sbatch.split_equal(cparams.n_ubatch);
  }
- const uint32_t n_tokens = ubatch.n_tokens;
+ } else {
+ ubatch = lctx.sbatch.split_simple(cparams.n_ubatch);
+ }

- // count the outputs in this u_batch
- {
- int32_t n_outputs_new = 0;
+ // count the outputs in this u_batch
+ {
+ int32_t n_outputs_new = 0;

- if (n_outputs == n_tokens_all) {
- n_outputs_new = n_tokens;
- } else {
- LM_GGML_ASSERT(ubatch.output);
- for (uint32_t i = 0; i < n_tokens; i++) {
- n_outputs_new += (int32_t) (ubatch.output[i] != 0);
- }
+ if (n_outputs == n_tokens_all) {
+ n_outputs_new = ubatch.n_tokens;
+ } else {
+ LM_GGML_ASSERT(ubatch.output);
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ n_outputs_new += int32_t(ubatch.output[i] != 0);
  }
+ }
+
+ // needs to happen before the graph is built
+ lctx.n_outputs = n_outputs_new;
+ }

- // needs to happen before the graph is built
- lctx.n_outputs = n_outputs_new;
+ // non-causal masks do not use the KV cache
+ if (hparams.causal_attn) {
+ llama_kv_cache_update(&lctx);
+
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) {
+ kv_self.head = 0;
  }

- int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
- lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+ const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ return 1;
+ }
+ kv_slot_restorer.save(slot);
+
+ if (!kv_self.recurrent) {
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
+ // after enough generations, the benefit from this heuristic disappears
+ // if we start defragmenting the cache, the benefit from this will be more important
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
+ kv_self.n = std::min(kv_self.size, std::max(pad, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
+ }
+ }

- LM_GGML_ASSERT(n_threads > 0);
+ return 0;
+ }

- // non-causal masks do not use the KV cache
- if (hparams.causal_attn) {
- llama_kv_cache_update(&lctx);
+ // decode a batch of tokens by evaluating the transformer
+ // in case of unsuccessful decoding (error or warning),
+ // the kv_cache state will be returned to its original state
+ // (for non-recurrent models) or cleaned (for recurrent models)
+ //
+ // - lctx: llama context
+ // - inp_batch: batch to evaluate
+ //
+ // return 0 on success
+ // return positive int on warning
+ // return negative int on error
+ //
+ static int llama_decode_impl(
+ llama_context & lctx,
+ llama_batch inp_batch) {

- // if we have enough unused cells before the current head ->
- // better to start searching from the beginning of the cache, hoping to fill it
- if (kv_self.head > kv_self.used + 2*n_tokens) {
- kv_self.head = 0;
- }
+ lctx.is_encoding = false;

- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
- if (!slot) {
- return 1;
- }
- kv_slot_restorer.save(slot);
+ if (inp_batch.n_tokens == 0) {
+ LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
+ return -1;
+ }
+
+ // temporarily allocate memory for the input batch if needed
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
+ const llama_batch & batch = batch_allocr.batch;
+
+ const auto & model = lctx.model;
+ const auto & vocab = model.vocab;
+ const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;

- if (!kv_self.recurrent) {
- // a heuristic, to avoid attending the full cache if it is not yet utilized
- // after enough generations, the benefit from this heuristic disappears
- // if we start defragmenting the cache, the benefit from this will be more important
- const uint32_t pad = llama_kv_cache_get_padding(cparams);
- kv_self.n = std::min(kv_self.size, std::max(pad, LM_GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
+ if (lctx.t_compute_start_us == 0) {
+ lctx.t_compute_start_us = lm_ggml_time_us();
+ }
+ auto & kv_self = lctx.kv_self;
+ llama_kv_slot_restorer kv_slot_restorer(kv_self);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_vocab = vocab.n_tokens();
+
+ uint32_t n_outputs = 0;
+ uint32_t n_outputs_prev = 0;
+
+ {
+ const int ret = llama_prepare_sbatch(lctx, batch, n_outputs);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+
+ while (lctx.sbatch.n_tokens > 0) {
+ llama_ubatch ubatch;
+ {
+ const int ret = llama_prepare_ubatch(lctx, kv_slot_restorer, ubatch, n_outputs, batch.n_tokens);
+ if (ret != 0) {
+ return ret;
  }
  }

+ const int n_threads = ubatch.n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+ lm_ggml_threadpool_t threadpool = ubatch.n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
+ LM_GGML_ASSERT(n_threads > 0);
+
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

  lm_ggml_backend_sched_reset(lctx.sched.get());
@@ -8651,7 +8699,7 @@ static int llama_decode_impl(

  // update the kv ring buffer
  {
- kv_self.head += n_tokens;
+ kv_self.head += ubatch.n_tokens;

  // Ensure kv cache head points to a valid index.
  if (kv_self.head >= kv_self.size) {
@@ -8764,12 +8812,14 @@ static int llama_decode_impl(
  //llama_synchronize(&lctx);

  // decide if we need to defrag the kv cache
- if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
- const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
+ if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+ // - do not defrag small contexts (i.e. < 2048 tokens)
+ // - count the padding towards the number of used tokens
+ const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;

  // queue defragmentation for next llama_kv_cache_update
  if (fragmentation > cparams.defrag_thold) {
- //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+ LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);

  llama_kv_cache_defrag(kv_self);
  }
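Note on the hunk above: defragmentation is now only considered when `defrag_thold` is strictly positive, caches smaller than 2048 cells are never defragmented, and the KV-cache padding now counts towards the used cells. The following is a minimal sketch of the new measure with hypothetical numbers; `cache_n`, `cache_used`, the padding of 32 and the 0.1 threshold are illustrative assumptions, not values taken from the package.

```cpp
#include <algorithm>
#include <cstdio>

// Hedged illustration of the new defrag heuristic: small contexts are never
// defragmented, and the cache padding counts towards the used cells.
static float fragmentation(int cache_n, int cache_used, int padding) {
    if (cache_n < 2048) {
        return 0.0f; // do not defrag small contexts
    }
    return std::max(0.0f, 1.0f - float(cache_used + padding) / float(cache_n));
}

int main() {
    const float defrag_thold = 0.1f;               // hypothetical threshold
    const float f = fragmentation(4096, 3000, 32); // hypothetical cache state
    std::printf("fragmentation = %.2f -> defrag %s\n",
                f, f > defrag_thold ? "requested" : "skipped");
    return 0;
}
```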
@@ -9391,8 +9441,6 @@ static struct llama_model * llama_model_load_from_file_impl(
  struct llama_model_params params) {
  lm_ggml_time_init();

- llama_model * model = new llama_model(params);
-
  unsigned cur_percentage = 0;
  if (params.progress_callback == NULL) {
  params.progress_callback_user_data = &cur_percentage;
@@ -9410,12 +9458,15 @@ static struct llama_model * llama_model_load_from_file_impl(
  };
  }

+ llama_model * model = new llama_model(params);
+
  // create list of devices to use with this model
  if (params.devices) {
  for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
  model->devices.push_back(*dev);
  }
  } else {
+ std::vector<lm_ggml_backend_dev_t> rpc_servers;
  // use all available devices
  for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
  lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
@@ -9426,10 +9477,19 @@ static struct llama_model * llama_model_load_from_file_impl(
  break;

  case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
- model->devices.push_back(dev);
+ lm_ggml_backend_reg_t reg = lm_ggml_backend_dev_backend_reg(dev);
+ if (lm_ggml_backend_reg_name(reg) == std::string("RPC")) {
+ rpc_servers.push_back(dev);
+ } else {
+ model->devices.push_back(dev);
+ }
  break;
  }
  }
+ // add RPC servers at the front of the list
+ if (!rpc_servers.empty()) {
+ model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+ }
  }

  // if using single GPU mode, remove all except the main GPU
package/cpp/llama.h CHANGED
@@ -214,7 +214,7 @@ extern "C" {
  LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
  };

- // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
+ // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
  typedef struct llama_token_data {
  llama_token id; // token id
  float logit; // log-odds of the token
@@ -308,7 +308,7 @@ extern "C" {
  };

  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
- // https://github.com/ggerganov/llama.cpp/pull/7544
+ // https://github.com/ggml-org/llama.cpp/pull/7544
  struct llama_context_params {
  uint32_t n_ctx; // text context, 0 = from model
  uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
@@ -321,7 +321,7 @@ extern "C" {
  enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
  enum llama_attention_type attention_type; // attention type to use for embeddings

- // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ // ref: https://github.com/ggml-org/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
  float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
@@ -386,7 +386,7 @@ extern "C" {
  struct llama_adapter_lora;

  // Helpers for getting default parameters
- // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+ // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
  LLAMA_API struct llama_model_params llama_model_default_params(void);
  LLAMA_API struct llama_context_params llama_context_default_params(void);
  LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
@@ -511,7 +511,8 @@ extern "C" {
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);

  // Get the default chat template. Returns nullptr if not available
- LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
+ // If name is NULL, returns the default chat template
+ LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);

  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
@@ -1040,7 +1041,7 @@ extern "C" {

  /// Apply chat template. Inspired by hf apply_chat_template() on python.
  /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
- /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+ /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
  /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
  /// @param chat Pointer to a list of multiple llama_chat_message
  /// @param n_msg Number of llama_chat_message in this chat
@@ -1114,11 +1115,12 @@ extern "C" {
  };

  struct llama_sampler {
- struct llama_sampler_i * iface;
- llama_sampler_context_t ctx;
+ const struct llama_sampler_i * iface;
+ llama_sampler_context_t ctx;
  };

  // mirror of llama_sampler_i:
+ LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
  LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
  LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
  LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1148,7 +1150,7 @@ extern "C" {
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
  DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
- "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+ "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1156,7 +1158,7 @@ extern "C" {
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
  LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);

- /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+ /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
  LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);

  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -1171,6 +1173,9 @@ extern "C" {
  /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
  LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);

+ /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+ LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1199,6 +1204,18 @@ extern "C" {
  const char * grammar_str,
  const char * grammar_root);

+ /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+ /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
+ /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
+ LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens);
+
  /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
  int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
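The llama.h hunks above expose three new entry points: `llama_sampler_init` (for custom sampler implementations), `llama_sampler_init_top_n_sigma`, and `llama_sampler_init_grammar_lazy`; `llama_model_chat_template` also gains a `name` parameter. Below is a hedged sketch of wiring the two new samplers into a sampler chain. It assumes the existing chain API from llama.h (`llama_sampler_chain_default_params`, `llama_sampler_chain_init`, `llama_sampler_chain_add`, `llama_sampler_init_dist`); the grammar string and trigger word are illustrative only, not taken from the package.

```cpp
#include "llama.h"

// Hedged sketch, not code from the package: build a sampler chain that uses
// the samplers introduced in this release.
static llama_sampler * build_chain_example(const llama_vocab * vocab) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // New: top-n-sigma keeps only tokens whose logits lie within n standard
    // deviations of the maximum logit.
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.0f));

    // New: lazy grammar, only enforced once a trigger word appears in the output.
    const char * trigger_words[] = { "<tool_call>" };                 // hypothetical trigger
    const char * grammar_str     = "root ::= [^<]+ \"</tool_call>\""; // hypothetical grammar
    llama_sampler_chain_add(chain, llama_sampler_init_grammar_lazy(
        vocab, grammar_str, "root",
        trigger_words, 1,
        /*trigger_tokens =*/ nullptr, 0));

    // Final sampling step; release the chain with llama_sampler_free() when done.
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return chain;
}
```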
package/cpp/log.cpp CHANGED
@@ -1,5 +1,6 @@
  #include "log.h"

+ #include <chrono>
  #include <condition_variable>
  #include <cstdarg>
  #include <cstdio>
@@ -8,22 +9,16 @@
  #include <thread>
  #include <vector>

+ #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
+ #include <android/log.h>
+ #endif
+
  int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;

  void common_log_set_verbosity_thold(int verbosity) {
  common_log_verbosity_thold = verbosity;
  }

- #define LOG_COL_DEFAULT "\033[0m"
- #define LOG_COL_BOLD "\033[1m"
- #define LOG_COL_RED "\033[31m"
- #define LOG_COL_GREEN "\033[32m"
- #define LOG_COL_YELLOW "\033[33m"
- #define LOG_COL_BLUE "\033[34m"
- #define LOG_COL_MAGENTA "\033[35m"
- #define LOG_COL_CYAN "\033[36m"
- #define LOG_COL_WHITE "\033[37m"
-
  static int64_t t_us() {
  return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
  }
@@ -66,6 +61,32 @@ struct common_log_entry {
  // signals the worker thread to stop
  bool is_end;

+ #if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING)
+ void android_print() const {
+ int android_log_priority;
+ switch (level) {
+ case LM_GGML_LOG_LEVEL_INFO:
+ android_log_priority = ANDROID_LOG_INFO;
+ break;
+ case LM_GGML_LOG_LEVEL_WARN:
+ android_log_priority = ANDROID_LOG_WARN;
+ break;
+ case LM_GGML_LOG_LEVEL_ERROR:
+ android_log_priority = ANDROID_LOG_ERROR;
+ break;
+ case LM_GGML_LOG_LEVEL_DEBUG:
+ android_log_priority = ANDROID_LOG_DEBUG;
+ break;
+ default:
+ android_log_priority = ANDROID_LOG_DEFAULT;
+ break;
+ }
+
+ const char * tag = "RNLLAMA_LOG_ANDROID";
+ __android_log_print(android_log_priority, tag, "%s", msg.data());
+ }
+ #endif
+
  void print(FILE * file = nullptr) const {
  FILE * fcur = file;
  if (!fcur) {
@@ -206,6 +227,7 @@ public:
  vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
  }
  #endif
+ va_end(args_copy);
  }

  entry.level = level;
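The `va_end(args_copy)` added above closes a `va_copy` that previously had no matching `va_end`. The sketch below is a generic, self-contained illustration of that pairing (the function names are hypothetical, not taken from the package): format once to measure, format again from the copy, then release both argument lists.

```cpp
#include <cstdarg>
#include <cstdio>
#include <vector>

// Generic va_copy/va_end pattern: probe the formatted length with the
// original list, then format for real with the copy. Every va_copy must be
// matched by a va_end, which is what the hunk above adds.
static std::vector<char> format_message(const char * fmt, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);

    const int len = std::vsnprintf(nullptr, 0, fmt, args);  // consumes args
    std::vector<char> msg(len > 0 ? len + 1 : 1, '\0');
    std::vsnprintf(msg.data(), msg.size(), fmt, args_copy); // uses the copy

    va_end(args_copy); // counterpart of va_copy
    return msg;
}

static void log_example(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    const std::vector<char> msg = format_message(fmt, args);
    va_end(args);
    std::fputs(msg.data(), stdout);
}

int main() {
    log_example("loaded %d layers in %.1f s\n", 32, 1.5);
    return 0;
}
```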
package/cpp/log.h CHANGED
@@ -2,9 +2,20 @@

  #include "ggml.h" // for lm_ggml_log_level

+ #define LOG_CLR_TO_EOL "\033[K\r"
+ #define LOG_COL_DEFAULT "\033[0m"
+ #define LOG_COL_BOLD "\033[1m"
+ #define LOG_COL_RED "\033[31m"
+ #define LOG_COL_GREEN "\033[32m"
+ #define LOG_COL_YELLOW "\033[33m"
+ #define LOG_COL_BLUE "\033[34m"
+ #define LOG_COL_MAGENTA "\033[35m"
+ #define LOG_COL_CYAN "\033[36m"
+ #define LOG_COL_WHITE "\033[37m"
+
  #ifndef __GNUC__
  # define LOG_ATTRIBUTE_FORMAT(...)
- #elif defined(__MINGW32__)
+ #elif defined(__MINGW32__) && !defined(__clang__)
  # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
  #else
  # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
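For context, `LOG_ATTRIBUTE_FORMAT` attaches printf-style format checking to the logging functions; the change above restricts the `gnu_printf` archetype to MinGW GCC and lets clang fall through to the plain `printf` branch. A minimal illustration of how such a macro is applied (the function `log_add_example` is hypothetical, not part of the package):

```cpp
#include <cstdarg>
#include <cstdio>

// Same macro shape as in package/cpp/log.h, reproduced here so the snippet
// is self-contained.
#ifndef __GNUC__
#  define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__) && !defined(__clang__)
#  define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#  define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

// Hypothetical printf-style helper; the attribute lets the compiler check the
// format string against the arguments at compile time.
LOG_ATTRIBUTE_FORMAT(1, 2)
static void log_add_example(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    std::vfprintf(stderr, fmt, args);
    va_end(args);
}

int main() {
    log_add_example("loaded %d tensors from %s\n", 42, "model.gguf");
    return 0;
}
```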