cui-llama.rn 1.4.4 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (216)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +54 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1085
  14. package/cpp/chat.h +143 -0
  15. package/cpp/common.cpp +1562 -1996
  16. package/cpp/common.h +677 -744
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-alloc.c +1039 -1030
  19. package/cpp/ggml-alloc.h +1 -1
  20. package/cpp/ggml-backend-impl.h +255 -255
  21. package/cpp/ggml-backend-reg.cpp +586 -582
  22. package/cpp/ggml-backend.cpp +2004 -2002
  23. package/cpp/ggml-backend.h +354 -354
  24. package/cpp/ggml-common.h +1857 -1851
  25. package/cpp/ggml-cpp.h +39 -39
  26. package/cpp/ggml-cpu-aarch64.cpp +5725 -4247
  27. package/cpp/ggml-cpu-aarch64.h +8 -8
  28. package/cpp/ggml-cpu-impl.h +512 -380
  29. package/cpp/ggml-cpu-quants.c +13026 -11517
  30. package/cpp/ggml-cpu-traits.cpp +36 -36
  31. package/cpp/ggml-cpu-traits.h +38 -38
  32. package/cpp/ggml-cpu.c +3438 -14485
  33. package/cpp/ggml-cpu.cpp +655 -633
  34. package/cpp/ggml-cpu.h +138 -135
  35. package/cpp/ggml-impl.h +594 -567
  36. package/cpp/ggml-metal-impl.h +312 -3
  37. package/cpp/ggml-metal.h +66 -66
  38. package/cpp/ggml-metal.m +5360 -5002
  39. package/cpp/ggml-opt.cpp +854 -854
  40. package/cpp/ggml-opt.h +216 -216
  41. package/cpp/ggml-quants.c +5238 -5238
  42. package/cpp/ggml-threading.h +14 -14
  43. package/cpp/ggml.c +6618 -6524
  44. package/cpp/ggml.h +2222 -2194
  45. package/cpp/gguf.cpp +1330 -1329
  46. package/cpp/gguf.h +202 -202
  47. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  48. package/cpp/json-schema-to-grammar.h +21 -22
  49. package/cpp/json.hpp +24766 -24766
  50. package/cpp/llama-adapter.cpp +382 -347
  51. package/cpp/llama-adapter.h +76 -74
  52. package/cpp/llama-arch.cpp +1714 -1492
  53. package/cpp/llama-arch.h +428 -402
  54. package/cpp/llama-batch.cpp +368 -368
  55. package/cpp/llama-batch.h +88 -88
  56. package/cpp/llama-chat.cpp +640 -587
  57. package/cpp/llama-chat.h +56 -53
  58. package/cpp/llama-context.cpp +2831 -1775
  59. package/cpp/llama-context.h +265 -128
  60. package/cpp/llama-cparams.cpp +1 -1
  61. package/cpp/llama-cparams.h +38 -37
  62. package/cpp/llama-cpp.h +30 -30
  63. package/cpp/llama-grammar.cpp +1219 -1219
  64. package/cpp/llama-grammar.h +173 -164
  65. package/cpp/llama-graph.cpp +1695 -0
  66. package/cpp/llama-graph.h +592 -0
  67. package/cpp/llama-hparams.cpp +79 -71
  68. package/cpp/llama-hparams.h +156 -139
  69. package/cpp/llama-impl.cpp +167 -167
  70. package/cpp/llama-impl.h +61 -61
  71. package/cpp/llama-io.cpp +15 -0
  72. package/cpp/llama-io.h +35 -0
  73. package/cpp/llama-kv-cache.cpp +1380 -718
  74. package/cpp/llama-kv-cache.h +213 -218
  75. package/cpp/llama-memory.cpp +1 -0
  76. package/cpp/llama-memory.h +21 -0
  77. package/cpp/llama-mmap.cpp +600 -590
  78. package/cpp/llama-mmap.h +68 -68
  79. package/cpp/llama-model-loader.cpp +1129 -1124
  80. package/cpp/llama-model-loader.h +169 -167
  81. package/cpp/llama-model.cpp +13080 -4023
  82. package/cpp/llama-model.h +409 -370
  83. package/cpp/llama-sampling.cpp +2563 -2525
  84. package/cpp/llama-sampling.h +32 -32
  85. package/cpp/llama-vocab.cpp +3295 -3252
  86. package/cpp/llama-vocab.h +125 -125
  87. package/cpp/llama.cpp +351 -10137
  88. package/cpp/llama.h +1434 -1340
  89. package/cpp/log.cpp +427 -423
  90. package/cpp/log.h +132 -132
  91. package/cpp/{chat-template.hpp → minja/chat-template.hpp} +537 -529
  92. package/cpp/{minja.hpp → minja/minja.hpp} +2941 -2883
  93. package/cpp/ops.cpp +8723 -0
  94. package/cpp/ops.h +128 -0
  95. package/cpp/rn-llama.cpp +45 -71
  96. package/cpp/rn-llama.h +3 -3
  97. package/cpp/sampling.cpp +573 -532
  98. package/cpp/sgemm.cpp +3043 -2598
  99. package/cpp/sgemm.h +14 -14
  100. package/cpp/simd-mappings.h +888 -0
  101. package/cpp/speculative.cpp +278 -277
  102. package/cpp/speculative.h +28 -28
  103. package/cpp/unary-ops.cpp +186 -0
  104. package/cpp/unary-ops.h +28 -0
  105. package/cpp/vec.cpp +258 -0
  106. package/cpp/vec.h +802 -0
  107. package/ios/CMakeLists.txt +5 -2
  108. package/ios/RNLlama.mm +2 -2
  109. package/ios/RNLlamaContext.mm +40 -24
  110. package/package.json +1 -1
  111. package/src/NativeRNLlama.ts +6 -4
  112. package/src/index.ts +3 -1
  113. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  114. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  115. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  116. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  117. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  118. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  119. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  120. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  122. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  124. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  125. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  126. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  127. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  128. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  129. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  130. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  131. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  132. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  133. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  134. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  135. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  136. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  194. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  195. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  196. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  197. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  198. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  199. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  200. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  201. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  202. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  203. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  204. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  205. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  206. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  207. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  208. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  209. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  210. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  211. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  212. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  213. package/android/src/main/build-arm64/Makefile +0 -1862
  214. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  215. package/cpp/chat.hpp +0 -55
  216. package/cpp/rn-llama.hpp +0 -913
package/cpp/llama-hparams.cpp
@@ -1,71 +1,79 @@
-#include "llama-hparams.h"
-
-#include "ggml.h"
-
-uint32_t llama_hparams::n_head(uint32_t il) const {
-    if (il < n_layer) {
-        return n_head_arr[il];
-    }
-
-    LM_GGML_ABORT("fatal error");
-}
-
-uint32_t llama_hparams::n_head_kv(uint32_t il) const {
-    if (il < n_layer) {
-        return n_head_kv_arr[il];
-    }
-
-    LM_GGML_ABORT("fatal error");
-}
-
-uint32_t llama_hparams::n_ff(uint32_t il) const {
-    if (il < n_layer) {
-        return n_ff_arr[il];
-    }
-
-    LM_GGML_ABORT("fatal error");
-}
-
-uint32_t llama_hparams::n_gqa(uint32_t il) const {
-    const uint32_t n_head    = this->n_head(il);
-    const uint32_t n_head_kv = this->n_head_kv(il);
-
-    if (n_head_kv == 0) {
-        return 0;
-    }
-
-    return n_head/n_head_kv;
-}
-
-uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
-    const uint32_t n_head_kv = this->n_head_kv(il);
-
-    return n_embd_head_k * n_head_kv;
-}
-
-uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
-    const uint32_t n_head_kv = this->n_head_kv(il);
-
-    return n_embd_head_v * n_head_kv;
-}
-
-uint32_t llama_hparams::n_embd_k_s() const {
-    if (wkv_head_size != 0) {
-        // for RWKV models
-        return token_shift_count * n_embd;
-    }
-
-    // TODO: maybe support other convolution strides than 1
-    // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
-}
-
-uint32_t llama_hparams::n_embd_v_s() const {
-    if (wkv_head_size != 0) {
-        // corresponds to RWKV's wkv_states size
-        return n_embd * wkv_head_size;
-    }
-
-    // corresponds to Mamba's ssm_states size
-    return ssm_d_state * ssm_d_inner;
-}
+#include "llama-hparams.h"
+
+#include "ggml.h"
+
+uint32_t llama_hparams::n_head(uint32_t il) const {
+    if (il < n_layer) {
+        return n_head_arr[il];
+    }
+
+    LM_GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_head_kv(uint32_t il) const {
+    if (il < n_layer) {
+        return n_head_kv_arr[il];
+    }
+
+    LM_GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_ff(uint32_t il) const {
+    if (il < n_layer) {
+        return n_ff_arr[il];
+    }
+
+    LM_GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_gqa(uint32_t il) const {
+    const uint32_t n_head    = this->n_head(il);
+    const uint32_t n_head_kv = this->n_head_kv(il);
+
+    if (n_head_kv == 0) {
+        return 0;
+    }
+
+    return n_head/n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
+    const uint32_t n_head_kv = this->n_head_kv(il);
+
+    return n_embd_head_k * n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
+    const uint32_t n_head_kv = this->n_head_kv(il);
+
+    return n_embd_head_v * n_head_kv;
+}
+
+uint32_t llama_hparams::n_embd_k_s() const {
+    if (wkv_head_size != 0) {
+        // for RWKV models
+        return token_shift_count * n_embd;
+    }
+
+    // TODO: maybe support other convolution strides than 1
+    // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+}
+
+uint32_t llama_hparams::n_embd_v_s() const {
+    if (wkv_head_size != 0) {
+        // corresponds to RWKV's wkv_states size
+        return n_embd * wkv_head_size;
+    }
+
+    // corresponds to Mamba's ssm_states size
+    return ssm_d_state * ssm_d_inner;
+}
+
+bool llama_hparams::is_swa(uint32_t il) const {
+    if (il < n_layer) {
+        return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
+    }
+
+    LM_GGML_ABORT("fatal error");
+}
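
The only functional change to llama-hparams.cpp in this release is the new is_swa() accessor above, which decides per layer whether sliding-window attention applies. The standalone sketch below simply restates that condition outside the class so the layer pattern is easy to see; the function name is_swa_layer and the example values are illustrative and not part of the package.

// Sketch (not from the package): which layers use sliding-window attention for a given n_swa_pattern.
// With the default n_swa_pattern = 1 the condition is always false, i.e. every layer uses full attention.
#include <cstdint>
#include <cstdio>

static bool is_swa_layer(uint32_t il, uint32_t n_swa, uint32_t n_swa_pattern) {
    // same condition as llama_hparams::is_swa() in the diff above
    return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
}

int main() {
    const uint32_t n_layer       = 8;    // illustrative values only
    const uint32_t n_swa         = 4096; // sliding-window width
    const uint32_t n_swa_pattern = 4;    // 3 sliding-window layers followed by 1 full-attention layer

    for (uint32_t il = 0; il < n_layer; ++il) {
        std::printf("layer %u: %s\n", il,
                    is_swa_layer(il, n_swa, n_swa_pattern) ? "sliding-window" : "full attention");
    }
    return 0;
}
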
package/cpp/llama-hparams.h
@@ -1,139 +1,156 @@
-#pragma once
-
-#include "llama.h"
-
-#include <array>
-
-// bump if necessary
-#define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
-
-enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
-};
-
-struct llama_hparams_posnet {
-    uint32_t n_embd;
-    uint32_t n_layer;
-};
-
-struct llama_hparams_convnext {
-    uint32_t n_embd;
-    uint32_t n_layer;
-};
-
-struct llama_hparams {
-    bool vocab_only;
-    bool rope_finetuned;
-    bool use_par_res;
-    bool swin_norm;
-
-    uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_embd;
-    uint32_t n_embd_features = 0;
-    uint32_t n_layer;
-    uint32_t n_rot;
-    uint32_t n_swa = 0; // sliding window attention (SWA)
-    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
-    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
-    uint32_t n_expert = 0;
-    uint32_t n_expert_used = 0;
-    uint32_t n_rel_attn_bkts = 0;
-
-    // for WavTokenizer
-    struct llama_hparams_posnet posnet;
-    struct llama_hparams_convnext convnext;
-
-    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
-    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
-    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
-
-    uint32_t n_layer_dense_lead = 0;
-    uint32_t n_lora_q = 0;
-    uint32_t n_lora_kv = 0;
-    uint32_t n_ff_exp = 0;
-    uint32_t n_ff_shexp = 0;
-    uint32_t n_expert_shared = 0;
-    uint32_t n_norm_groups = 0;
-
-    float expert_weights_scale = 0.0;
-    bool expert_weights_norm = false;
-    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
-
-    float f_norm_eps;
-    float f_norm_rms_eps;
-    float f_norm_group_eps;
-
-    float f_attn_logit_softcapping = 50.0f;
-    float f_final_logit_softcapping = 30.0f;
-
-    // for RWKV
-    uint32_t rescale_every_n_layers = 0;
-    uint32_t time_mix_extra_dim = 0;
-    uint32_t time_decay_extra_dim = 0;
-    uint32_t wkv_head_size = 0;
-    uint32_t token_shift_count = 2;
-
-    float rope_attn_factor = 1.0f;
-    float rope_freq_base_train;
-    float rope_freq_scale_train;
-    uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
-
-    std::array<int, 4> rope_sections;
-
-    // for State Space Models
-    uint32_t ssm_d_conv = 0;
-    uint32_t ssm_d_inner = 0;
-    uint32_t ssm_d_state = 0;
-    uint32_t ssm_dt_rank = 0;
-
-    bool ssm_dt_b_c_rms = false;
-
-    float f_clamp_kqv = 0.0f;
-    float f_max_alibi_bias = 0.0f;
-    float f_logit_scale = 0.0f;
-
-    // Additional scale factors (Granite/Granite MoE)
-    float f_residual_scale = 0.0f;
-    float f_embedding_scale = 0.0f;
-    float f_attention_scale = 0.0f;
-
-    bool causal_attn = true;
-    bool use_alibi = false;
-    bool attn_soft_cap = false;
-
-    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
-    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
-
-    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
-    enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
-    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
-
-    uint32_t n_head(uint32_t il = 0) const;
-
-    uint32_t n_head_kv(uint32_t il = 0) const;
-
-    uint32_t n_ff(uint32_t il = 0) const;
-
-    uint32_t n_gqa(uint32_t il = 0) const;
-
-    // dimension of key embeddings across all k-v heads
-    uint32_t n_embd_k_gqa(uint32_t il = 0) const;
-
-    // dimension of value embeddings across all k-v heads
-    uint32_t n_embd_v_gqa(uint32_t il = 0) const;
-
-    // dimension of the rolling state embeddings
-    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
-    uint32_t n_embd_k_s() const;
-
-    // dimension of the recurrent state embeddings
-    uint32_t n_embd_v_s() const;
-};
-
-static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-
+#pragma once
+
+#include "llama.h"
+
+#include <array>
+
+// bump if necessary
+#define LLAMA_MAX_LAYERS 512
+#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
+
+struct llama_hparams_posnet {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams {
+    bool vocab_only;
+    bool rope_finetuned;
+    bool use_par_res;
+    bool swin_norm;
+
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_embd;
+    uint32_t n_embd_features = 0;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_swa = 0; // sliding window attention (SWA)
+    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
+    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
+    uint32_t n_expert = 0;
+    uint32_t n_expert_used = 0;
+    uint32_t n_rel_attn_bkts = 0;
+
+    // for WavTokenizer
+    struct llama_hparams_posnet posnet;
+    struct llama_hparams_convnext convnext;
+
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+
+    uint32_t n_layer_dense_lead = 0;
+    uint32_t n_lora_q = 0;
+    uint32_t n_lora_kv = 0;
+    uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
+    uint32_t n_expert_shared = 0;
+    uint32_t n_norm_groups = 0;
+
+    float expert_weights_scale = 0.0;
+    bool expert_weights_norm = false;
+    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+    float f_norm_group_eps;
+
+    float f_attn_logit_softcapping = 50.0f;
+    float f_final_logit_softcapping = 30.0f;
+
+    // for RWKV
+    uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
+    uint32_t wkv_head_size = 0;
+    uint32_t token_shift_count = 2;
+    uint32_t n_lora_decay = 0;
+    uint32_t n_lora_iclr = 0;
+    uint32_t n_lora_value_res_mix = 0;
+    uint32_t n_lora_gate = 0;
+
+    float rope_attn_factor = 1.0f;
+    float rope_freq_base_train;
+    float rope_freq_base_train_swa;
+    float rope_freq_scale_train;
+    float rope_freq_scale_train_swa;
+    uint32_t n_ctx_orig_yarn;
+    float rope_yarn_log_mul;
+
+    std::array<int, 4> rope_sections;
+
+    // for State Space Models
+    uint32_t ssm_d_conv = 0;
+    uint32_t ssm_d_inner = 0;
+    uint32_t ssm_d_state = 0;
+    uint32_t ssm_dt_rank = 0;
+
+    bool ssm_dt_b_c_rms = false;
+
+    float f_clamp_kqv = 0.0f;
+    float f_max_alibi_bias = 0.0f;
+    float f_logit_scale = 0.0f;
+
+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
+    bool causal_attn = true;
+    bool use_alibi = false;
+    bool attn_soft_cap = false;
+
+    uint32_t n_moe_layer_step = 0;
+    bool use_kq_norm = true;
+    uint32_t n_attn_chunk = 0;
+    // values below seems to be fixed on llama4
+    uint32_t n_no_rope_layer_step = 4;
+    uint32_t n_attn_temp_floor_scale = 8192;
+    float f_attn_temp_scale = 0.1;
+
+    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
+    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+    enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
+
+    uint32_t n_head(uint32_t il = 0) const;
+
+    uint32_t n_head_kv(uint32_t il = 0) const;
+
+    uint32_t n_ff(uint32_t il = 0) const;
+
+    uint32_t n_gqa(uint32_t il = 0) const;
+
+    // dimension of key embeddings across all k-v heads
+    uint32_t n_embd_k_gqa(uint32_t il = 0) const;
+
+    // dimension of value embeddings across all k-v heads
+    uint32_t n_embd_v_gqa(uint32_t il = 0) const;
+
+    // dimension of the rolling state embeddings
+    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+    uint32_t n_embd_k_s() const;
+
+    // dimension of the recurrent state embeddings
+    uint32_t n_embd_v_s() const;
+
+    bool is_swa(uint32_t il) const;
+};
+
+static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
+
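
For readers skimming the header above: n_gqa(), n_embd_k_gqa() and n_embd_v_gqa() (unchanged in this release) derive the grouped-query-attention ratio and the per-layer K/V widths from the head dimensions, as implemented in llama-hparams.cpp earlier in this diff. A minimal sketch with hypothetical numbers, mirroring those formulas; none of these values come from any model shipped with the package.

// Illustrative only: how the per-layer K/V widths reported by the accessors above are derived.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_embd_head_k = 128; // d_k per head (hypothetical)
    const uint32_t n_embd_head_v = 128; // d_v per head (hypothetical)
    const uint32_t n_head        = 32;  // query heads (hypothetical)
    const uint32_t n_head_kv     = 8;   // shared k/v heads under GQA (hypothetical)

    const uint32_t n_gqa        = n_head / n_head_kv;        // 4 query heads per k/v head
    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // 1024 = K width stored per token per layer
    const uint32_t n_embd_v_gqa = n_embd_head_v * n_head_kv; // 1024 = V width stored per token per layer

    std::printf("n_gqa=%u n_embd_k_gqa=%u n_embd_v_gqa=%u\n", n_gqa, n_embd_k_gqa, n_embd_v_gqa);
    return 0;
}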