cui-llama.rn 1.4.4 → 1.5.0

This diff shows the contents of publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
Files changed (216)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +54 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1085
  14. package/cpp/chat.h +143 -0
  15. package/cpp/common.cpp +1562 -1996
  16. package/cpp/common.h +677 -744
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-alloc.c +1039 -1030
  19. package/cpp/ggml-alloc.h +1 -1
  20. package/cpp/ggml-backend-impl.h +255 -255
  21. package/cpp/ggml-backend-reg.cpp +586 -582
  22. package/cpp/ggml-backend.cpp +2004 -2002
  23. package/cpp/ggml-backend.h +354 -354
  24. package/cpp/ggml-common.h +1857 -1851
  25. package/cpp/ggml-cpp.h +39 -39
  26. package/cpp/ggml-cpu-aarch64.cpp +5725 -4247
  27. package/cpp/ggml-cpu-aarch64.h +8 -8
  28. package/cpp/ggml-cpu-impl.h +512 -380
  29. package/cpp/ggml-cpu-quants.c +13026 -11517
  30. package/cpp/ggml-cpu-traits.cpp +36 -36
  31. package/cpp/ggml-cpu-traits.h +38 -38
  32. package/cpp/ggml-cpu.c +3438 -14485
  33. package/cpp/ggml-cpu.cpp +655 -633
  34. package/cpp/ggml-cpu.h +138 -135
  35. package/cpp/ggml-impl.h +594 -567
  36. package/cpp/ggml-metal-impl.h +312 -3
  37. package/cpp/ggml-metal.h +66 -66
  38. package/cpp/ggml-metal.m +5360 -5002
  39. package/cpp/ggml-opt.cpp +854 -854
  40. package/cpp/ggml-opt.h +216 -216
  41. package/cpp/ggml-quants.c +5238 -5238
  42. package/cpp/ggml-threading.h +14 -14
  43. package/cpp/ggml.c +6618 -6524
  44. package/cpp/ggml.h +2222 -2194
  45. package/cpp/gguf.cpp +1330 -1329
  46. package/cpp/gguf.h +202 -202
  47. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  48. package/cpp/json-schema-to-grammar.h +21 -22
  49. package/cpp/json.hpp +24766 -24766
  50. package/cpp/llama-adapter.cpp +382 -347
  51. package/cpp/llama-adapter.h +76 -74
  52. package/cpp/llama-arch.cpp +1714 -1492
  53. package/cpp/llama-arch.h +428 -402
  54. package/cpp/llama-batch.cpp +368 -368
  55. package/cpp/llama-batch.h +88 -88
  56. package/cpp/llama-chat.cpp +640 -587
  57. package/cpp/llama-chat.h +56 -53
  58. package/cpp/llama-context.cpp +2831 -1775
  59. package/cpp/llama-context.h +265 -128
  60. package/cpp/llama-cparams.cpp +1 -1
  61. package/cpp/llama-cparams.h +38 -37
  62. package/cpp/llama-cpp.h +30 -30
  63. package/cpp/llama-grammar.cpp +1219 -1219
  64. package/cpp/llama-grammar.h +173 -164
  65. package/cpp/llama-graph.cpp +1695 -0
  66. package/cpp/llama-graph.h +592 -0
  67. package/cpp/llama-hparams.cpp +79 -71
  68. package/cpp/llama-hparams.h +156 -139
  69. package/cpp/llama-impl.cpp +167 -167
  70. package/cpp/llama-impl.h +61 -61
  71. package/cpp/llama-io.cpp +15 -0
  72. package/cpp/llama-io.h +35 -0
  73. package/cpp/llama-kv-cache.cpp +1380 -718
  74. package/cpp/llama-kv-cache.h +213 -218
  75. package/cpp/llama-memory.cpp +1 -0
  76. package/cpp/llama-memory.h +21 -0
  77. package/cpp/llama-mmap.cpp +600 -590
  78. package/cpp/llama-mmap.h +68 -68
  79. package/cpp/llama-model-loader.cpp +1129 -1124
  80. package/cpp/llama-model-loader.h +169 -167
  81. package/cpp/llama-model.cpp +13080 -4023
  82. package/cpp/llama-model.h +409 -370
  83. package/cpp/llama-sampling.cpp +2563 -2525
  84. package/cpp/llama-sampling.h +32 -32
  85. package/cpp/llama-vocab.cpp +3295 -3252
  86. package/cpp/llama-vocab.h +125 -125
  87. package/cpp/llama.cpp +351 -10137
  88. package/cpp/llama.h +1434 -1340
  89. package/cpp/log.cpp +427 -423
  90. package/cpp/log.h +132 -132
  91. package/cpp/{chat-template.hpp → minja/chat-template.hpp} +537 -529
  92. package/cpp/{minja.hpp → minja/minja.hpp} +2941 -2883
  93. package/cpp/ops.cpp +8723 -0
  94. package/cpp/ops.h +128 -0
  95. package/cpp/rn-llama.cpp +45 -71
  96. package/cpp/rn-llama.h +3 -3
  97. package/cpp/sampling.cpp +573 -532
  98. package/cpp/sgemm.cpp +3043 -2598
  99. package/cpp/sgemm.h +14 -14
  100. package/cpp/simd-mappings.h +888 -0
  101. package/cpp/speculative.cpp +278 -277
  102. package/cpp/speculative.h +28 -28
  103. package/cpp/unary-ops.cpp +186 -0
  104. package/cpp/unary-ops.h +28 -0
  105. package/cpp/vec.cpp +258 -0
  106. package/cpp/vec.h +802 -0
  107. package/ios/CMakeLists.txt +5 -2
  108. package/ios/RNLlama.mm +2 -2
  109. package/ios/RNLlamaContext.mm +40 -24
  110. package/package.json +1 -1
  111. package/src/NativeRNLlama.ts +6 -4
  112. package/src/index.ts +3 -1
  113. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  114. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  115. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  116. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  117. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  118. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  119. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  120. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  122. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  124. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  125. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  126. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  127. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  128. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  129. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  130. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  131. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  132. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  133. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  134. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  135. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  136. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  194. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  195. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  196. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  197. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  198. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  199. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  200. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  201. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  202. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  203. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  204. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  205. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  206. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  207. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  208. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  209. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  210. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  211. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  212. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  213. package/android/src/main/build-arm64/Makefile +0 -1862
  214. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  215. package/cpp/chat.hpp +0 -55
  216. package/cpp/rn-llama.hpp +0 -913
package/cpp/llama-adapter.cpp
@@ -1,347 +1,382 @@
-#include "llama-adapter.h"
-
-#include "llama-impl.h"
-#include "llama-mmap.h"
-#include "llama-model.h"
-
-#include <algorithm>
-#include <map>
-#include <cassert>
-#include <stdexcept>
-
-// vec
-
-struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
-    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
-        return nullptr;
-    }
-
-    return tensors[il];
-}
-
-struct lm_ggml_tensor * llama_adapter_cvec::apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const {
-    lm_ggml_tensor * layer_dir = tensor_for(il);
-    if (layer_dir != nullptr) {
-        cur = lm_ggml_add(ctx, cur, layer_dir);
-    }
-
-    return cur;
-}
-
-bool llama_adapter_cvec::init(const llama_model & model) {
-    const auto & hparams = model.hparams;
-
-    LM_GGML_ASSERT(tensors.empty());
-    LM_GGML_ASSERT(ctxs.empty());
-    LM_GGML_ASSERT(bufs.empty());
-
-    // create a context for each buffer type
-    std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
-    auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
-        auto it = ctx_map.find(buft);
-        if (it == ctx_map.end()) {
-            struct lm_ggml_init_params params = {
-                /*.mem_size =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
-                /*.mem_buffer =*/ NULL,
-                /*.no_alloc =*/ true,
-            };
-
-            lm_ggml_context * ctx = lm_ggml_init(params);
-            if (!ctx) {
-                return nullptr;
-            }
-
-            ctx_map[buft] = ctx;
-            ctxs.emplace_back(ctx);
-
-            return ctx;
-        }
-
-        return it->second;
-    };
-
-    // make tensors
-    tensors.reserve(hparams.n_layer);
-    tensors.push_back(nullptr); // there's never a tensor for layer 0
-    for (size_t il = 1; il < hparams.n_layer; il++) {
-        lm_ggml_backend_buffer_type_t buft = model.select_buft(il);
-        lm_ggml_context * ctx = ctx_for_buft(buft);
-        if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
-            return false;
-        }
-        lm_ggml_tensor * tensor = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, hparams.n_embd);
-        tensors.push_back(tensor);
-    }
-
-    // allocate tensors / buffers and zero
-    bufs.reserve(ctx_map.size());
-    for (auto it : ctx_map) {
-        lm_ggml_backend_buffer_type_t buft = it.first;
-        lm_ggml_context * ctx = it.second;
-        lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-        if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
-            return false;
-        }
-        lm_ggml_backend_buffer_clear(buf, 0);
-        bufs.emplace_back(buf);
-    }
-
-    return true;
-}
-
-int32_t llama_adapter_cvec::apply(
-        const llama_model & model,
-        const float * data,
-        size_t len,
-        int32_t n_embd,
-        int32_t il_start,
-        int32_t il_end) {
-    const auto & hparams = model.hparams;
-
-    if (data == nullptr) {
-        // disable the current control vector (but leave allocated for later)
-        layer_start = -1;
-        layer_end = -1;
-        return 0;
-    }
-
-    if (n_embd != (int) hparams.n_embd) {
-        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
-    }
-
-    if (tensors.empty()) {
-        if (!init(model)) {
-            return 1;
-        }
-    }
-
-    layer_start = il_start;
-    layer_end = il_end;
-
-    for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(tensors[il] != nullptr);
-
-        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
-        if (off + n_embd <= len) {
-            lm_ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * lm_ggml_element_size(tensors[il]));
-        }
-    }
-
-    return 0;
-}
-
-// lora
-
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor * w) {
-    const std::string name(w->name);
-
-    const auto pos = ab_map.find(name);
-    if (pos != ab_map.end()) {
-        return &pos->second;
-    }
-
-    return nullptr;
-}
-
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
-    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
-
-    lm_ggml_context * ctx_init;
-    struct lm_gguf_init_params meta_lm_gguf_params = {
-        /* .no_alloc = */ true,
-        /* .ctx = */ &ctx_init,
-    };
-
-    lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(path_lora, meta_lm_gguf_params) };
-    if (!ctx_gguf) {
-        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
-    }
-
-    lm_ggml_context_ptr ctx { ctx_init };
-
-    // check metadata
-    {
-        auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf.get(), id));
-        };
-        auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf.get(), id);
-        };
-        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
-        if (general_type != "adapter") {
-            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
-        }
-
-        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
-        auto general_arch = llm_arch_from_string(general_arch_str);
-        if (general_arch != model.arch) {
-            throw std::runtime_error("model arch and LoRA arch mismatch");
-        }
-
-        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
-        if (adapter_type != "lora") {
-            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
-        }
-
-        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
-    }
-
-    int n_tensors = lm_gguf_get_n_tensors(ctx_gguf.get());
-
-    // contexts for each buffer type
-    std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
-    auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
-        auto it = ctx_map.find(buft);
-        if (it == ctx_map.end()) {
-            // add a new context
-            struct lm_ggml_init_params params = {
-                /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
-                /*.mem_buffer =*/ NULL,
-                /*.no_alloc =*/ true,
-            };
-            lm_ggml_context * buft_ctx = lm_ggml_init(params);
-            if (!buft_ctx) {
-                return nullptr;
-            }
-            ctx_map[buft] = buft_ctx;
-            adapter.ctxs.emplace_back(buft_ctx);
-            return buft_ctx;
-        };
-        return it->second;
-    };
-
-    // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_adapter_lora_weight> ab_map;
-    auto str_endswith = [](const std::string & str, const std::string & suffix) {
-        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-    };
-
-    for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx.get()); cur; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
-        std::string name(cur->name);
-        if (str_endswith(name, ".lora_a")) {
-            replace_all(name, ".lora_a", "");
-            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
-            } else {
-                ab_map[name].a = cur;
-            }
-        } else if (str_endswith(name, ".lora_b")) {
-            replace_all(name, ".lora_b", "");
-            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
-            } else {
-                ab_map[name].b = cur;
-            }
-        } else if (str_endswith(name, "_norm.weight")) {
-            // TODO: add support for norm vector
-            // for now, we don't really care because most adapters still work fine without it
-            continue;
-        } else {
-            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
-        }
-    }
-
-    // add tensors
-    for (auto & it : ab_map) {
-        const std::string & name = it.first;
-        llama_adapter_lora_weight & w = it.second;
-        bool is_token_embd = str_endswith(name, "token_embd.weight");
-
-        if (!w.a || !w.b) {
-            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
-        }
-
-        // device buft and device ctx
-        const auto * model_tensor = model.get_tensor(name.c_str());
-        if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
-        }
-
-        struct lm_ggml_context * dev_ctx = ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
-        // validate tensor shape
-        if (is_token_embd) {
-            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
-            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
-                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
-            }
-        } else {
-            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
-            }
-            if (w.a->ne[1] != w.b->ne[0]) {
-                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
-            }
-        }
-
-        // save tensor to adapter
-        struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
-        struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
-        lm_ggml_set_name(tensor_a, w.a->name);
-        lm_ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
-    }
-
-    // allocate tensors / buffers and zero
-    {
-        adapter.ctxs.reserve(ctx_map.size());
-        adapter.bufs.reserve(ctx_map.size());
-        for (auto & it : ctx_map) {
-            lm_ggml_backend_buffer_type_t buft = it.first;
-            lm_ggml_context * ctx_dev = it.second;
-            lm_ggml_backend_buffer_ptr buf { lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
-            if (!buf) {
-                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
-            }
-            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
-            adapter.bufs.emplace_back(std::move(buf));
-        }
-    }
-
-    // set tensor data
-    {
-        llama_file lm_gguf_file(path_lora, "rb");
-        std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
-            size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
-            size_t size = lm_ggml_nbytes(orig);
-            read_buf.resize(size);
-            lm_gguf_file.seek(offs, SEEK_SET);
-            lm_gguf_file.read_raw(read_buf.data(), size);
-            lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
-        };
-        for (auto & it : adapter.ab_map) {
-            auto orig = ab_map[it.first];
-            auto dev = it.second;
-            set_tensor(orig.a, dev.a);
-            set_tensor(orig.b, dev.b);
-        }
-    }
-
-    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
-}
-
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
-
-    try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
-        return adapter;
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-
-        delete adapter;
-    }
-
-    return nullptr;
-}
-
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
-    delete adapter;
-}
+#include "llama-adapter.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+#include "llama-model.h"
+
+#include <map>
+#include <cassert>
+#include <stdexcept>
+
+// vec
+
+lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+        return nullptr;
+    }
+
+    return tensors[il];
+}
+
+lm_ggml_tensor * llama_adapter_cvec::apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const {
+    lm_ggml_tensor * layer_dir = tensor_for(il);
+    if (layer_dir != nullptr) {
+        cur = lm_ggml_add(ctx, cur, layer_dir);
+    }
+
+    return cur;
+}
+
+bool llama_adapter_cvec::init(const llama_model & model) {
+    const auto & hparams = model.hparams;
+
+    LM_GGML_ASSERT(tensors.empty());
+    LM_GGML_ASSERT(ctxs.empty());
+    LM_GGML_ASSERT(bufs.empty());
+
+    // create a context for each buffer type
+    std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            lm_ggml_init_params params = {
+                /*.mem_size =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc =*/ true,
+            };
+
+            lm_ggml_context * ctx = lm_ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map[buft] = ctx;
+            ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+
+        return it->second;
+    };
+
+    // make tensors
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        lm_ggml_backend_buffer_type_t buft = model.select_buft(il);
+        lm_ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
+        lm_ggml_tensor * tensor = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, hparams.n_embd);
+        tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    bufs.reserve(ctx_map.size());
+    for (auto it : ctx_map) {
+        lm_ggml_backend_buffer_type_t buft = it.first;
+        lm_ggml_context * ctx = it.second;
+        lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        lm_ggml_backend_buffer_clear(buf, 0);
+        bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+bool llama_adapter_cvec::apply(
+        const llama_model & model,
+        const float * data,
+        size_t len,
+        int32_t n_embd,
+        int32_t il_start,
+        int32_t il_end) {
+    const auto & hparams = model.hparams;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        layer_start = -1;
+        layer_end = -1;
+        return true;
+    }
+
+    if (n_embd != (int) hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return false;
+    }
+
+    if (tensors.empty()) {
+        if (!init(model)) {
+            return false;
+        }
+    }
+
+    layer_start = il_start;
+    layer_end = il_end;
+
+    for (size_t il = 1; il < hparams.n_layer; il++) {
+        assert(tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            lm_ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * lm_ggml_element_size(tensors[il]));
+        }
+    }
+
+    return true;
+}
+
+// lora
+
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(lm_ggml_tensor * w) {
+    const std::string name(w->name);
+
+    const auto pos = ab_map.find(name);
+    if (pos != ab_map.end()) {
+        return &pos->second;
+    }
+
+    return nullptr;
+}
+
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+    lm_ggml_context * ctx_init;
+    lm_gguf_init_params meta_lm_gguf_params = {
+        /* .no_alloc = */ true,
+        /* .ctx = */ &ctx_init,
+    };
+
+    lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(path_lora, meta_lm_gguf_params) };
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+    }
+
+    lm_ggml_context_ptr ctx { ctx_init };
+
+    // check metadata
+    {
+        auto get_kv_str = [&](const std::string & key) -> std::string {
+            int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf.get(), id));
+        };
+        auto get_kv_f32 = [&](const std::string & key) -> float {
+            int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
+            return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf.get(), id);
+        };
+        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+        auto general_arch = llm_arch_from_string(general_arch_str);
+        if (general_arch != model.arch) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+
+        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+    }
+
+    int n_tensors = lm_gguf_get_n_tensors(ctx_gguf.get());
+
+    // contexts for each buffer type
+    std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
+            lm_ggml_init_params params = {
+                /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc =*/ true,
+            };
+            lm_ggml_context * buft_ctx = lm_ggml_init(params);
+            if (!buft_ctx) {
+                return nullptr;
+            }
+            ctx_map[buft] = buft_ctx;
+            adapter.ctxs.emplace_back(buft_ctx);
+            return buft_ctx;
+        };
+        return it->second;
+    };
+
+    // bundle lora_a and lora_b into pairs
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
+    auto str_endswith = [](const std::string & str, const std::string & suffix) {
+        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+    };
+
+    for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx.get()); cur; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
+        std::string name(cur->name);
+        if (str_endswith(name, ".lora_a")) {
+            replace_all(name, ".lora_a", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
+            } else {
+                ab_map[name].a = cur;
+            }
+        } else if (str_endswith(name, ".lora_b")) {
+            replace_all(name, ".lora_b", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
+            } else {
+                ab_map[name].b = cur;
+            }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
+        } else {
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+        }
+    }
+
+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<lm_ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
+            lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
+
+        if (lm_ggml_backend_dev_get_extra_bufts_fn) {
+            lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
+    // add tensors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
+
+        if (!w.a || !w.b) {
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
+        // device buft and device ctx
+        const auto * model_tensor = model.get_tensor(name.c_str());
+        if (!model_tensor) {
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
+        }
+
+        auto * buft = lm_ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+        lm_ggml_context * dev_ctx = ctx_for_buft(buft);
+        // validate tensor shape
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
+        }
+
+        // save tensor to adapter
+        lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+        lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
+        lm_ggml_set_name(tensor_a, w.a->name);
+        lm_ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
+    }
+
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto & it : ctx_map) {
+            lm_ggml_backend_buffer_type_t buft = it.first;
+            lm_ggml_context * ctx_dev = it.second;
+            lm_ggml_backend_buffer_ptr buf { lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+            if (!buf) {
+                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+            }
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+            adapter.bufs.emplace_back(std::move(buf));
+        }
+    }
+
+    // set tensor data
+    {
+        llama_file lm_gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](lm_ggml_tensor * orig, lm_ggml_tensor * dev) {
+            size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
+            size_t size = lm_ggml_nbytes(orig);
+            read_buf.resize(size);
+            lm_gguf_file.seek(offs, SEEK_SET);
+            lm_gguf_file.read_raw(read_buf.data(), size);
+            lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+        };
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
+        }
+    }
+
+    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+}
+
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();
+
+    try {
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        return adapter;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+        delete adapter;
+    }
+
+    return nullptr;
+}
+
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    delete adapter;
+}
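
Both the old and new versions of llama_adapter_lora_init_impl shown above rely on the same small pattern in ctx_for_buft: a memoizing lambda that lazily creates one lm_ggml_context per backend buffer type and reuses it on later lookups, so the adapter ends up with exactly one context (and later one buffer) per buffer type it touches. The self-contained C++ sketch below illustrates that pattern with stand-in Context and BufferType types; these names are placeholders for illustration only, not the real ggml API.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Stand-ins for lm_ggml_backend_buffer_type_t and lm_ggml_context (illustrative only).
using BufferType = std::string;
struct Context { BufferType buft; };

int main() {
    std::map<BufferType, Context *> ctx_map;     // one context per buffer type (mirrors ctx_map)
    std::vector<std::unique_ptr<Context>> ctxs;  // owns the contexts (mirrors adapter.ctxs)

    // Memoizing factory: create a context on first use of a buffer type, reuse it afterwards.
    auto ctx_for_buft = [&](const BufferType & buft) -> Context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            auto ctx = std::make_unique<Context>(Context{buft});
            ctx_map[buft] = ctx.get();
            ctxs.emplace_back(std::move(ctx));
            return ctx_map[buft];
        }
        return it->second;
    };

    // Three tensors spread over two buffer types end up sharing only two contexts.
    for (const BufferType & buft : {std::string("CPU"), std::string("CPU_REPACK"), std::string("CPU")}) {
        std::cout << buft << " -> context " << ctx_for_buft(buft) << "\n";
    }
    std::cout << "contexts created: " << ctxs.size() << "\n";  // prints 2
    return 0;
}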