cui-llama.rn 1.4.4 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (216)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +54 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1085
  14. package/cpp/chat.h +143 -0
  15. package/cpp/common.cpp +1562 -1996
  16. package/cpp/common.h +677 -744
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-alloc.c +1039 -1030
  19. package/cpp/ggml-alloc.h +1 -1
  20. package/cpp/ggml-backend-impl.h +255 -255
  21. package/cpp/ggml-backend-reg.cpp +586 -582
  22. package/cpp/ggml-backend.cpp +2004 -2002
  23. package/cpp/ggml-backend.h +354 -354
  24. package/cpp/ggml-common.h +1857 -1851
  25. package/cpp/ggml-cpp.h +39 -39
  26. package/cpp/ggml-cpu-aarch64.cpp +5725 -4247
  27. package/cpp/ggml-cpu-aarch64.h +8 -8
  28. package/cpp/ggml-cpu-impl.h +512 -380
  29. package/cpp/ggml-cpu-quants.c +13026 -11517
  30. package/cpp/ggml-cpu-traits.cpp +36 -36
  31. package/cpp/ggml-cpu-traits.h +38 -38
  32. package/cpp/ggml-cpu.c +3438 -14485
  33. package/cpp/ggml-cpu.cpp +655 -633
  34. package/cpp/ggml-cpu.h +138 -135
  35. package/cpp/ggml-impl.h +594 -567
  36. package/cpp/ggml-metal-impl.h +312 -3
  37. package/cpp/ggml-metal.h +66 -66
  38. package/cpp/ggml-metal.m +5360 -5002
  39. package/cpp/ggml-opt.cpp +854 -854
  40. package/cpp/ggml-opt.h +216 -216
  41. package/cpp/ggml-quants.c +5238 -5238
  42. package/cpp/ggml-threading.h +14 -14
  43. package/cpp/ggml.c +6618 -6524
  44. package/cpp/ggml.h +2222 -2194
  45. package/cpp/gguf.cpp +1330 -1329
  46. package/cpp/gguf.h +202 -202
  47. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  48. package/cpp/json-schema-to-grammar.h +21 -22
  49. package/cpp/json.hpp +24766 -24766
  50. package/cpp/llama-adapter.cpp +382 -347
  51. package/cpp/llama-adapter.h +76 -74
  52. package/cpp/llama-arch.cpp +1714 -1492
  53. package/cpp/llama-arch.h +428 -402
  54. package/cpp/llama-batch.cpp +368 -368
  55. package/cpp/llama-batch.h +88 -88
  56. package/cpp/llama-chat.cpp +640 -587
  57. package/cpp/llama-chat.h +56 -53
  58. package/cpp/llama-context.cpp +2831 -1775
  59. package/cpp/llama-context.h +265 -128
  60. package/cpp/llama-cparams.cpp +1 -1
  61. package/cpp/llama-cparams.h +38 -37
  62. package/cpp/llama-cpp.h +30 -30
  63. package/cpp/llama-grammar.cpp +1219 -1219
  64. package/cpp/llama-grammar.h +173 -164
  65. package/cpp/llama-graph.cpp +1695 -0
  66. package/cpp/llama-graph.h +592 -0
  67. package/cpp/llama-hparams.cpp +79 -71
  68. package/cpp/llama-hparams.h +156 -139
  69. package/cpp/llama-impl.cpp +167 -167
  70. package/cpp/llama-impl.h +61 -61
  71. package/cpp/llama-io.cpp +15 -0
  72. package/cpp/llama-io.h +35 -0
  73. package/cpp/llama-kv-cache.cpp +1380 -718
  74. package/cpp/llama-kv-cache.h +213 -218
  75. package/cpp/llama-memory.cpp +1 -0
  76. package/cpp/llama-memory.h +21 -0
  77. package/cpp/llama-mmap.cpp +600 -590
  78. package/cpp/llama-mmap.h +68 -68
  79. package/cpp/llama-model-loader.cpp +1129 -1124
  80. package/cpp/llama-model-loader.h +169 -167
  81. package/cpp/llama-model.cpp +13080 -4023
  82. package/cpp/llama-model.h +409 -370
  83. package/cpp/llama-sampling.cpp +2563 -2525
  84. package/cpp/llama-sampling.h +32 -32
  85. package/cpp/llama-vocab.cpp +3295 -3252
  86. package/cpp/llama-vocab.h +125 -125
  87. package/cpp/llama.cpp +351 -10137
  88. package/cpp/llama.h +1434 -1340
  89. package/cpp/log.cpp +427 -423
  90. package/cpp/log.h +132 -132
  91. package/cpp/{chat-template.hpp → minja/chat-template.hpp} +537 -529
  92. package/cpp/{minja.hpp → minja/minja.hpp} +2941 -2883
  93. package/cpp/ops.cpp +8723 -0
  94. package/cpp/ops.h +128 -0
  95. package/cpp/rn-llama.cpp +45 -71
  96. package/cpp/rn-llama.h +3 -3
  97. package/cpp/sampling.cpp +573 -532
  98. package/cpp/sgemm.cpp +3043 -2598
  99. package/cpp/sgemm.h +14 -14
  100. package/cpp/simd-mappings.h +888 -0
  101. package/cpp/speculative.cpp +278 -277
  102. package/cpp/speculative.h +28 -28
  103. package/cpp/unary-ops.cpp +186 -0
  104. package/cpp/unary-ops.h +28 -0
  105. package/cpp/vec.cpp +258 -0
  106. package/cpp/vec.h +802 -0
  107. package/ios/CMakeLists.txt +5 -2
  108. package/ios/RNLlama.mm +2 -2
  109. package/ios/RNLlamaContext.mm +40 -24
  110. package/package.json +1 -1
  111. package/src/NativeRNLlama.ts +6 -4
  112. package/src/index.ts +3 -1
  113. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  114. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  115. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  116. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  117. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  118. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  119. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  120. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  122. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  124. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  125. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  126. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  127. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  128. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  129. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  130. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  131. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  132. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  133. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  134. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  135. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  136. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  194. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  195. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  196. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  197. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  198. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  199. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  200. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  201. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  202. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  203. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  204. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  205. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  206. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  207. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  208. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  209. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  210. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  211. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  212. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  213. package/android/src/main/build-arm64/Makefile +0 -1862
  214. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  215. package/cpp/chat.hpp +0 -55
  216. package/cpp/rn-llama.hpp +0 -913
package/cpp/ops.h ADDED
@@ -0,0 +1,128 @@
+#pragma once
+
+#include "ggml.h"
+
+//
+// cache line
+//
+
+#if defined(__cpp_lib_hardware_interference_size)
+#define CACHE_LINE_SIZE std::hardware_destructive_interference_size
+#else
+#if defined(__POWER9_VECTOR__)
+#define CACHE_LINE_SIZE 128
+#elif defined(__VXE__) || defined(__VXE2__)
+#define CACHE_LINE_SIZE 256
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+#endif
+
+static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void lm_ggml_compute_forward_dup(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_add(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_add1(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_acc(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_sum(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_sum_rows(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_mean(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_argmax(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_count_equal(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_repeat(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_repeat_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_concat(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_silu_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_norm(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_rms_norm(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_rms_norm_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_group_norm(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_l2_norm(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_out_prod(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_scale(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_set(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_cpy(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_cont(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_reshape(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_view(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_permute(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_transpose(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_get_rows(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_get_rows_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_diag(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_diag_mask_inf(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_diag_mask_zero(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_soft_max(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_soft_max_ext_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_rope(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_rope_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_clamp(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_conv_transpose_1d(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_im2col(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_im2col_back_f32(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_conv_transpose_2d(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_pool_1d(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_pool_2d(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_pool_2d_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_upscale(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_pad(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_pad_reflect_1d(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_arange(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_timestep_embedding(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_argsort(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_leaky_relu(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_flash_attn_ext(
+        const struct lm_ggml_compute_params * params,
+        const struct lm_ggml_tensor * q,
+        const struct lm_ggml_tensor * k,
+        const struct lm_ggml_tensor * v,
+        const struct lm_ggml_tensor * mask,
+        struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_flash_attn_back(
+        const struct lm_ggml_compute_params * params,
+        const bool masked,
+        struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_ssm_conv(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_ssm_scan(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_win_part(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_win_unpart(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_unary(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_get_rel_pos(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_add_rel_pos(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_rwkv_wkv6(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_rwkv_wkv7(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_gla(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_map_unary(
+        const struct lm_ggml_compute_params * params,
+        struct lm_ggml_tensor * dst,
+        const lm_ggml_unary_op_f32_t fun);
+void lm_ggml_compute_forward_map_binary(
+        const struct lm_ggml_compute_params * params,
+        struct lm_ggml_tensor * dst,
+        const lm_ggml_binary_op_f32_t fun);
+void lm_ggml_compute_forward_map_custom1_f32(
+        const struct lm_ggml_compute_params * params,
+        struct lm_ggml_tensor * dst,
+        const lm_ggml_custom1_op_f32_t fun);
+void lm_ggml_compute_forward_map_custom2_f32(
+        const struct lm_ggml_compute_params * params,
+        struct lm_ggml_tensor * dst,
+        const lm_ggml_custom2_op_f32_t fun);
+void lm_ggml_compute_forward_map_custom3_f32(
+        const struct lm_ggml_compute_params * params,
+        struct lm_ggml_tensor * dst,
+        const lm_ggml_custom3_op_f32_t fun);
+void lm_ggml_compute_forward_map_custom1(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_map_custom2(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_map_custom3(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_cross_entropy_loss(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_cross_entropy_loss_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+void lm_ggml_compute_forward_opt_step_adamw(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
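
A note for reviewers: ops.h (together with ops.cpp, unary-ops.*, binary-ops.*, vec.* and simd-mappings.h in the list above) appears to track upstream llama.cpp's ggml-cpu refactor, which moves per-operator forward kernels out of the monolithic ggml-cpu.c; that is consistent with ggml-cpu.c shrinking by roughly 11,000 lines in this release while ops.cpp arrives with 8,723. The CACHE_LINE_SIZE / CACHE_LINE_SIZE_F32 constants exist so per-thread scratch buffers can be padded to cache-line boundaries. Below is a minimal, self-contained C++ sketch of that padding idea (illustrative names only, not code from this package):

// Why a CACHE_LINE_SIZE constant matters: padding each per-thread accumulator
// to its own cache line keeps two threads from writing to the same line
// ("false sharing"), which would otherwise serialize them on cache traffic.
#include <cstddef>
#include <thread>
#include <vector>

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 64   // conservative default, matching the header's fallback
#endif

struct alignas(CACHE_LINE_SIZE) padded_sum {
    float value = 0.0f;      // alignas pads each element to a full cache line
};

int main() {
    const int n_threads = 4;
    std::vector<padded_sum> sums(n_threads);
    std::vector<std::thread> workers;
    for (int t = 0; t < n_threads; ++t) {
        workers.emplace_back([&sums, t] {
            for (int i = 0; i < 1000000; ++i) sums[t].value += 1.0f;
        });
    }
    for (auto & w : workers) w.join();
    return 0;
}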
package/cpp/rn-llama.cpp CHANGED
@@ -191,10 +191,10 @@ bool llama_rn_context::loadModel(common_params &params_)
     ctx = llama_init.context.get();
     if (model == nullptr)
     {
-        LOG_ERROR("unable to load model: %s", params_.model.c_str());
+        LOG_ERROR("unable to load model: %s", params_.model.path.c_str());
         return false;
     }
-    templates = common_chat_templates_from_model(model, params.chat_template);
+    templates = common_chat_templates_init(model, params.chat_template);
     n_ctx = llama_n_ctx(ctx);

     // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
@@ -219,71 +219,46 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     const bool &parallel_tool_calls,
     const std::string &tool_choice
 ) const {
-    common_chat_inputs inputs;
-    inputs.messages = json::parse(messages);
-    auto useTools = !tools.empty();
-    if (useTools) {
-        inputs.tools = json::parse(tools);
-    }
-    inputs.parallel_tool_calls = parallel_tool_calls;
-    if (!tool_choice.empty()) {
-        inputs.tool_choice = tool_choice;
-    }
-    if (!json_schema.empty()) {
-        inputs.json_schema = json::parse(json_schema);
-    }
-    inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    inputs.stream = true;
-
-    // If chat_template is provided, create new one and use it (probably slow)
-    if (!chat_template.empty()) {
-        auto tmp = common_chat_templates_from_model(model, chat_template);
-        const common_chat_template* template_ptr = useTools && tmp.template_tool_use ? tmp.template_tool_use.get() : tmp.template_default.get();
-        if (inputs.parallel_tool_calls && !template_ptr->original_caps().supports_parallel_tool_calls) {
-            inputs.parallel_tool_calls = false;
-        }
-        return common_chat_params_init(*template_ptr, inputs);
-    } else {
-        const common_chat_template* template_ptr = useTools && templates.template_tool_use ? templates.template_tool_use.get() : templates.template_default.get();
-        if (inputs.parallel_tool_calls && !template_ptr->original_caps().supports_parallel_tool_calls) {
-            inputs.parallel_tool_calls = false;
-        }
-        return common_chat_params_init(*template_ptr, inputs);
-    }
+    common_chat_templates_inputs inputs;
+    inputs.use_jinja = true;
+    inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
+    auto useTools = !tools.empty();
+    if (useTools) {
+        inputs.tools = common_chat_tools_parse_oaicompat(json::parse(tools));
+    }
+    inputs.parallel_tool_calls = parallel_tool_calls;
+    if (!tool_choice.empty()) {
+        inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
+    }
+    if (!json_schema.empty()) {
+        inputs.json_schema = json::parse(json_schema);
+    }
+    inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+
+    // If chat_template is provided, create new one and use it (probably slow)
+    if (!chat_template.empty()) {
+        auto tmps = common_chat_templates_init(model, chat_template);
+        return common_chat_templates_apply(tmps.get(), inputs);
+    } else {
+        return common_chat_templates_apply(templates.get(), inputs);
+    }
 }

 std::string llama_rn_context::getFormattedChat(
     const std::string &messages,
     const std::string &chat_template
 ) const {
-    auto chat_json = json::parse(messages);
-
-    // Handle regular chat without tools
-    std::vector<common_chat_msg> chat_msgs;
-    for (const auto &msg : chat_json) {
-        chat_msgs.push_back({
-            msg["role"].get<std::string>(),
-            msg["content"].get<std::string>()
-        });
-    }
-
-    // If chat_template is provided, create new one and use it (probably slow)
-    if (!chat_template.empty()) {
-        auto tmp = common_chat_templates_from_model(model, chat_template);
-        return common_chat_apply_template(
-            *tmp.template_default,
-            chat_msgs,
-            true,
-            false
-        );
-    } else {
-        return common_chat_apply_template(
-            *templates.template_default,
-            chat_msgs,
-            true,
-            false
-        );
-    }
+    common_chat_templates_inputs inputs;
+    inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
+    inputs.use_jinja = false;
+
+    // If chat_template is provided, create new one and use it (probably slow)
+    if (!chat_template.empty()) {
+        auto tmps = common_chat_templates_init(model, chat_template);
+        return common_chat_templates_apply(tmps.get(), inputs).prompt;
+    } else {
+        return common_chat_templates_apply(templates.get(), inputs).prompt;
+    }
 }

 void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
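
The two rewrites above replace the old template plumbing (common_chat_templates_from_model, common_chat_params_init, common_chat_apply_template) with the unified upstream API: common_chat_templates_init builds the template set, common_chat_templates_apply renders it, and inputs.use_jinja selects Jinja versus legacy rendering. The tool-use template selection and the parallel_tool_calls capability check that this file used to do by hand are now handled inside the library. A hedged sketch of a minimal caller, using only calls visible in this diff (render_prompt is an invented name):

// Renders an OpenAI-style messages array to a prompt string with the new API.
// Error handling and the surrounding class are omitted; signatures are as
// used in the rn-llama.cpp changes above.
#include <string>
#include "chat.h"
#include "json.hpp"

using json = nlohmann::json;

std::string render_prompt(common_chat_templates * tmpls, const std::string & messages_json) {
    common_chat_templates_inputs inputs;
    inputs.use_jinja = false;   // legacy rendering, as in getFormattedChat above
    inputs.messages  = common_chat_msgs_parse_oaicompat(json::parse(messages_json));
    // Default-vs-tool-use template selection and capability checks now happen
    // inside common_chat_templates_apply.
    return common_chat_templates_apply(tmpls, inputs).prompt;
}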
@@ -358,7 +333,7 @@ void llama_rn_context::loadPrompt() {
     }

     // since #3228 we now have to manually manage the KV cache
-    llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+    llama_kv_self_seq_rm(ctx, 0, n_past, -1);

     LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
         n_past,
@@ -388,8 +363,8 @@ completion_token_output llama_rn_context::nextToken()
         const int n_left = n_past - params.n_keep - 1;
         const int n_discard = n_left/2;

-        llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
-        llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+        llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+        llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

         for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
         {
@@ -595,7 +570,6 @@ std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
     float *data;

     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    printf("pooling_type: %d\n", pooling_type);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
         data = llama_get_embeddings(ctx);
     } else {
@@ -644,7 +618,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
         }
         batch.logits[batch.n_tokens - 1] = 1; // true

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         const int64_t t_pp_start = llama_time_us();
         if (llama_decode(ctx, batch) != 0)
@@ -652,7 +626,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
             LOG_ERROR("llama_decode() failed during prompt", "");
         }
         const int64_t t_pp_end = llama_time_us();
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         if (is_interrupted) break;

@@ -676,7 +650,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)

         const int64_t t_tg_end = llama_time_us();

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
         const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
@@ -702,7 +676,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
         tg_std = 0;
     }

-    if (is_interrupted) llama_kv_cache_clear(ctx);
+    if (is_interrupted) llama_kv_self_clear(ctx);
     is_predicting = false;

     char model_desc[128];
@@ -880,8 +854,8 @@ void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int

     //extract the unwanted tokens out from context and KV
     int diff = found - trimstart;
-    llama_kv_cache_seq_rm(ctx, 0, trimstart, trimstart + diff);
-    llama_kv_cache_seq_add(ctx, 0, trimstart + diff, -1, -diff);
+    llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
+    llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);

     for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
     {
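
Every other change in this file is the mechanical upstream rename llama_kv_cache_* to llama_kv_self_*; arguments and semantics are unchanged, the new names simply make explicit that the calls address the context's own KV cache. Since the context-shift arithmetic in nextToken() is easy to misread, here is a self-contained simulation of the bookkeeping on a plain token vector, standing in for the llama_kv_self_seq_rm / llama_kv_self_seq_add pair:

// Mimics the nextToken() shift: keep the first n_keep + 1 tokens, discard the
// next n_discard = (n_past - n_keep - 1) / 2, and let the tail slide left;
// the single erase plays the role of llama_kv_self_seq_rm followed by
// llama_kv_self_seq_add(..., -n_discard).
#include <cassert>
#include <vector>

std::vector<int> context_shift(std::vector<int> tokens, int n_keep) {
    const int n_past    = (int) tokens.size();
    const int n_left    = n_past - n_keep - 1;
    const int n_discard = n_left / 2;
    tokens.erase(tokens.begin() + n_keep + 1,
                 tokens.begin() + n_keep + 1 + n_discard);
    return tokens;
}

int main() {
    std::vector<int> tokens;
    for (int i = 0; i < 10; ++i) tokens.push_back(i);
    const auto shifted = context_shift(tokens, /*n_keep=*/3);
    // n_left = 6, n_discard = 3: tokens 4, 5, 6 are dropped, 7..9 shift left.
    assert(shifted.size() == 7 && shifted[4] == 7);
    return 0;
}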
package/cpp/rn-llama.h CHANGED
@@ -3,8 +3,7 @@

 #include <sstream>
 #include <iostream>
-#include "chat.hpp"
-#include "chat-template.hpp"
+#include "chat.h"
 #include "common.h"
 #include "ggml.h"
 #include "gguf.h"
@@ -17,6 +16,7 @@

 namespace rnllama {

+
 std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);

 std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
@@ -65,7 +65,7 @@ struct llama_rn_context {

     llama_context *ctx = nullptr;
     common_sampler *ctx_sampling = nullptr;
-    common_chat_templates templates;
+    common_chat_templates_ptr templates;

     int n_ctx;
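
The templates member changes from a value type, whose template_default / template_tool_use fields the old code dereferenced directly, to common_chat_templates_ptr, an owning smart pointer to an opaque object; that is why the .cpp changes above pass templates.get(). Assuming it follows upstream's usual pattern (an assumption, not something visible in this diff), the typedef is a std::unique_ptr with a custom deleter. A hypothetical, self-contained sketch of that shape, with invented names:

// Hypothetical stand-ins for an opaque object with a C-style lifetime API.
#include <cstdio>
#include <memory>

struct opaque_templates { int dummy = 0; };
opaque_templates * templates_create()                   { return new opaque_templates(); }
void               templates_free(opaque_templates * p) { delete p; }

// unique_ptr with a custom deleter: destruction calls the C-style free
// function automatically, which is roughly what common_chat_templates_ptr
// presumably provides for common_chat_templates.
struct templates_deleter {
    void operator()(opaque_templates * p) const { templates_free(p); }
};
using templates_ptr = std::unique_ptr<opaque_templates, templates_deleter>;

int main() {
    templates_ptr tmpls(templates_create());
    // .get() hands the raw pointer to APIs in the style of
    // common_chat_templates_apply(tmpls.get(), inputs).
    std::printf("templates alive: %s\n", tmpls ? "yes" : "no");
    return 0;   // templates_free runs here via the deleter
}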