cui-llama.rn 1.4.4 → 1.4.6

This diff shows the contents of publicly available package versions that have been released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
Files changed (197)
  1. package/android/src/main/CMakeLists.txt +2 -2
  2. package/android/src/main/jni.cpp +12 -10
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/chat-template.hpp +529 -529
  12. package/cpp/chat.cpp +959 -265
  13. package/cpp/chat.h +135 -0
  14. package/cpp/common.cpp +2064 -1996
  15. package/cpp/common.h +700 -744
  16. package/cpp/ggml-alloc.c +1039 -1030
  17. package/cpp/ggml-alloc.h +1 -1
  18. package/cpp/ggml-backend-impl.h +255 -255
  19. package/cpp/ggml-backend-reg.cpp +586 -582
  20. package/cpp/ggml-backend.cpp +2004 -2002
  21. package/cpp/ggml-backend.h +354 -354
  22. package/cpp/ggml-common.h +1851 -1851
  23. package/cpp/ggml-cpp.h +39 -39
  24. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  25. package/cpp/ggml-cpu-aarch64.h +8 -8
  26. package/cpp/ggml-cpu-impl.h +531 -380
  27. package/cpp/ggml-cpu-quants.c +12527 -11517
  28. package/cpp/ggml-cpu-traits.cpp +36 -36
  29. package/cpp/ggml-cpu-traits.h +38 -38
  30. package/cpp/ggml-cpu.c +15766 -14485
  31. package/cpp/ggml-cpu.cpp +655 -633
  32. package/cpp/ggml-cpu.h +138 -135
  33. package/cpp/ggml-impl.h +567 -567
  34. package/cpp/ggml-metal-impl.h +235 -0
  35. package/cpp/ggml-metal.h +66 -66
  36. package/cpp/ggml-metal.m +5146 -5002
  37. package/cpp/ggml-opt.cpp +854 -854
  38. package/cpp/ggml-opt.h +216 -216
  39. package/cpp/ggml-quants.c +5238 -5238
  40. package/cpp/ggml-threading.h +14 -14
  41. package/cpp/ggml.c +6529 -6524
  42. package/cpp/ggml.h +2198 -2194
  43. package/cpp/gguf.cpp +1329 -1329
  44. package/cpp/gguf.h +202 -202
  45. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  46. package/cpp/json-schema-to-grammar.h +21 -22
  47. package/cpp/json.hpp +24766 -24766
  48. package/cpp/llama-adapter.cpp +347 -347
  49. package/cpp/llama-adapter.h +74 -74
  50. package/cpp/llama-arch.cpp +1513 -1492
  51. package/cpp/llama-arch.h +403 -402
  52. package/cpp/llama-batch.cpp +368 -368
  53. package/cpp/llama-batch.h +88 -88
  54. package/cpp/llama-chat.cpp +588 -587
  55. package/cpp/llama-chat.h +53 -53
  56. package/cpp/llama-context.cpp +1775 -1775
  57. package/cpp/llama-context.h +128 -128
  58. package/cpp/llama-cparams.cpp +1 -1
  59. package/cpp/llama-cparams.h +37 -37
  60. package/cpp/llama-cpp.h +30 -30
  61. package/cpp/llama-grammar.cpp +1219 -1219
  62. package/cpp/llama-grammar.h +173 -164
  63. package/cpp/llama-hparams.cpp +71 -71
  64. package/cpp/llama-hparams.h +139 -139
  65. package/cpp/llama-impl.cpp +167 -167
  66. package/cpp/llama-impl.h +61 -61
  67. package/cpp/llama-kv-cache.cpp +718 -718
  68. package/cpp/llama-kv-cache.h +219 -218
  69. package/cpp/llama-mmap.cpp +600 -590
  70. package/cpp/llama-mmap.h +68 -68
  71. package/cpp/llama-model-loader.cpp +1124 -1124
  72. package/cpp/llama-model-loader.h +167 -167
  73. package/cpp/llama-model.cpp +4087 -4023
  74. package/cpp/llama-model.h +370 -370
  75. package/cpp/llama-sampling.cpp +2558 -2525
  76. package/cpp/llama-sampling.h +32 -32
  77. package/cpp/llama-vocab.cpp +3264 -3252
  78. package/cpp/llama-vocab.h +125 -125
  79. package/cpp/llama.cpp +10284 -10137
  80. package/cpp/llama.h +1354 -1340
  81. package/cpp/log.cpp +393 -423
  82. package/cpp/log.h +132 -132
  83. package/cpp/minja/chat-template.hpp +529 -0
  84. package/cpp/minja/minja.hpp +2915 -0
  85. package/cpp/minja.hpp +2915 -2883
  86. package/cpp/rn-llama.cpp +20 -37
  87. package/cpp/rn-llama.h +12 -2
  88. package/cpp/sampling.cpp +570 -532
  89. package/cpp/sgemm.cpp +2598 -2598
  90. package/cpp/sgemm.h +14 -14
  91. package/cpp/speculative.cpp +278 -277
  92. package/cpp/speculative.h +28 -28
  93. package/package.json +1 -1
  94. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  95. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  96. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  97. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  98. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  99. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  100. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  101. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  102. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  103. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  104. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  105. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  106. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  107. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  108. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  109. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  110. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  111. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  112. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  113. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  114. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  115. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  116. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  117. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  118. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  119. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  120. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  122. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  124. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  125. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  126. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  127. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  128. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  129. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  130. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  132. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  134. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  135. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  136. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  194. package/android/src/main/build-arm64/Makefile +0 -1862
  195. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  196. package/cpp/chat.hpp +0 -55
  197. package/cpp/rn-llama.hpp +0 -913
package/cpp/llama-adapter.cpp
@@ -1,347 +1,347 @@
- #include "llama-adapter.h"
-
- #include "llama-impl.h"
- #include "llama-mmap.h"
- #include "llama-model.h"
-
- #include <algorithm>
- #include <map>
- #include <cassert>
- #include <stdexcept>
-
- // vec
-
- struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
-     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
-         return nullptr;
-     }
-
-     return tensors[il];
- }
-
- struct lm_ggml_tensor * llama_adapter_cvec::apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const {
-     lm_ggml_tensor * layer_dir = tensor_for(il);
-     if (layer_dir != nullptr) {
-         cur = lm_ggml_add(ctx, cur, layer_dir);
-     }
-
-     return cur;
- }
-
- bool llama_adapter_cvec::init(const llama_model & model) {
-     const auto & hparams = model.hparams;
-
-     LM_GGML_ASSERT(tensors.empty());
-     LM_GGML_ASSERT(ctxs.empty());
-     LM_GGML_ASSERT(bufs.empty());
-
-     // create a context for each buffer type
-     std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
-     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
-         auto it = ctx_map.find(buft);
-         if (it == ctx_map.end()) {
-             struct lm_ggml_init_params params = {
-                 /*.mem_size =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
-                 /*.mem_buffer =*/ NULL,
-                 /*.no_alloc =*/ true,
-             };
-
-             lm_ggml_context * ctx = lm_ggml_init(params);
-             if (!ctx) {
-                 return nullptr;
-             }
-
-             ctx_map[buft] = ctx;
-             ctxs.emplace_back(ctx);
-
-             return ctx;
-         }
-
-         return it->second;
-     };
-
-     // make tensors
-     tensors.reserve(hparams.n_layer);
-     tensors.push_back(nullptr); // there's never a tensor for layer 0
-     for (size_t il = 1; il < hparams.n_layer; il++) {
-         lm_ggml_backend_buffer_type_t buft = model.select_buft(il);
-         lm_ggml_context * ctx = ctx_for_buft(buft);
-         if (!ctx) {
-             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
-             return false;
-         }
-         lm_ggml_tensor * tensor = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, hparams.n_embd);
-         tensors.push_back(tensor);
-     }
-
-     // allocate tensors / buffers and zero
-     bufs.reserve(ctx_map.size());
-     for (auto it : ctx_map) {
-         lm_ggml_backend_buffer_type_t buft = it.first;
-         lm_ggml_context * ctx = it.second;
-         lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-         if (!buf) {
-             LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
-             return false;
-         }
-         lm_ggml_backend_buffer_clear(buf, 0);
-         bufs.emplace_back(buf);
-     }
-
-     return true;
- }
-
- int32_t llama_adapter_cvec::apply(
-         const llama_model & model,
-         const float * data,
-         size_t len,
-         int32_t n_embd,
-         int32_t il_start,
-         int32_t il_end) {
-     const auto & hparams = model.hparams;
-
-     if (data == nullptr) {
-         // disable the current control vector (but leave allocated for later)
-         layer_start = -1;
-         layer_end = -1;
-         return 0;
-     }
-
-     if (n_embd != (int) hparams.n_embd) {
-         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-         return 1;
-     }
-
-     if (tensors.empty()) {
-         if (!init(model)) {
-             return 1;
-         }
-     }
-
-     layer_start = il_start;
-     layer_end = il_end;
-
-     for (size_t il = 1; il < hparams.n_layer; il++) {
-         assert(tensors[il] != nullptr);
-
-         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
-         if (off + n_embd <= len) {
-             lm_ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * lm_ggml_element_size(tensors[il]));
-         }
-     }
-
-     return 0;
- }
-
- // lora
-
- llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor * w) {
-     const std::string name(w->name);
-
-     const auto pos = ab_map.find(name);
-     if (pos != ab_map.end()) {
-         return &pos->second;
-     }
-
-     return nullptr;
- }
-
- static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
-     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
-
-     lm_ggml_context * ctx_init;
-     struct lm_gguf_init_params meta_lm_gguf_params = {
-         /* .no_alloc = */ true,
-         /* .ctx = */ &ctx_init,
-     };
-
-     lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(path_lora, meta_lm_gguf_params) };
-     if (!ctx_gguf) {
-         throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
-     }
-
-     lm_ggml_context_ptr ctx { ctx_init };
-
-     // check metadata
-     {
-         auto get_kv_str = [&](const std::string & key) -> std::string {
-             int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
-             return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf.get(), id));
-         };
-         auto get_kv_f32 = [&](const std::string & key) -> float {
-             int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
-             return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf.get(), id);
-         };
-         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-         auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
-         if (general_type != "adapter") {
-             throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
-         }
-
-         auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
-         auto general_arch = llm_arch_from_string(general_arch_str);
-         if (general_arch != model.arch) {
-             throw std::runtime_error("model arch and LoRA arch mismatch");
-         }
-
-         auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
-         if (adapter_type != "lora") {
-             throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
-         }
-
-         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
-     }
-
-     int n_tensors = lm_gguf_get_n_tensors(ctx_gguf.get());
-
-     // contexts for each buffer type
-     std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
-     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
-         auto it = ctx_map.find(buft);
-         if (it == ctx_map.end()) {
-             // add a new context
-             struct lm_ggml_init_params params = {
-                 /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
-                 /*.mem_buffer =*/ NULL,
-                 /*.no_alloc =*/ true,
-             };
-             lm_ggml_context * buft_ctx = lm_ggml_init(params);
-             if (!buft_ctx) {
-                 return nullptr;
-             }
-             ctx_map[buft] = buft_ctx;
-             adapter.ctxs.emplace_back(buft_ctx);
-             return buft_ctx;
-         };
-         return it->second;
-     };
-
-     // bundle lora_a and lora_b into pairs
-     std::map<std::string, llama_adapter_lora_weight> ab_map;
-     auto str_endswith = [](const std::string & str, const std::string & suffix) {
-         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-     };
-
-     for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx.get()); cur; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
-         std::string name(cur->name);
-         if (str_endswith(name, ".lora_a")) {
-             replace_all(name, ".lora_a", "");
-             if (ab_map.find(name) == ab_map.end()) {
-                 ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
-             } else {
-                 ab_map[name].a = cur;
-             }
-         } else if (str_endswith(name, ".lora_b")) {
-             replace_all(name, ".lora_b", "");
-             if (ab_map.find(name) == ab_map.end()) {
-                 ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
-             } else {
-                 ab_map[name].b = cur;
-             }
-         } else if (str_endswith(name, "_norm.weight")) {
-             // TODO: add support for norm vector
-             // for now, we don't really care because most adapters still work fine without it
-             continue;
-         } else {
-             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
-         }
-     }
-
-     // add tensors
-     for (auto & it : ab_map) {
-         const std::string & name = it.first;
-         llama_adapter_lora_weight & w = it.second;
-         bool is_token_embd = str_endswith(name, "token_embd.weight");
-
-         if (!w.a || !w.b) {
-             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
-         }
-
-         // device buft and device ctx
-         const auto * model_tensor = model.get_tensor(name.c_str());
-         if (!model_tensor) {
-             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
-         }
-
-         struct lm_ggml_context * dev_ctx = ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
-         // validate tensor shape
-         if (is_token_embd) {
-             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
-             if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
-                 throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
-             }
-         } else {
-             if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-                 throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
-             }
-             if (w.a->ne[1] != w.b->ne[0]) {
-                 throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
-             }
-         }
-
-         // save tensor to adapter
-         struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
-         struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
-         lm_ggml_set_name(tensor_a, w.a->name);
-         lm_ggml_set_name(tensor_b, w.b->name);
-         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
-     }
-
-     // allocate tensors / buffers and zero
-     {
-         adapter.ctxs.reserve(ctx_map.size());
-         adapter.bufs.reserve(ctx_map.size());
-         for (auto & it : ctx_map) {
-             lm_ggml_backend_buffer_type_t buft = it.first;
-             lm_ggml_context * ctx_dev = it.second;
-             lm_ggml_backend_buffer_ptr buf { lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
-             if (!buf) {
-                 throw std::runtime_error("failed to allocate buffer for lora adapter\n");
-             }
-             LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
-             adapter.bufs.emplace_back(std::move(buf));
-         }
-     }
-
-     // set tensor data
-     {
-         llama_file lm_gguf_file(path_lora, "rb");
-         std::vector<uint8_t> read_buf;
-         auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
-             size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
-             size_t size = lm_ggml_nbytes(orig);
-             read_buf.resize(size);
-             lm_gguf_file.seek(offs, SEEK_SET);
-             lm_gguf_file.read_raw(read_buf.data(), size);
-             lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
-         };
-         for (auto & it : adapter.ab_map) {
-             auto orig = ab_map[it.first];
-             auto dev = it.second;
-             set_tensor(orig.a, dev.a);
-             set_tensor(orig.b, dev.b);
-         }
-     }
-
-     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
- }
-
- struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-     struct llama_adapter_lora * adapter = new llama_adapter_lora();
-
-     try {
-         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
-         return adapter;
-     } catch (const std::exception & err) {
-         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-
-         delete adapter;
-     }
-
-     return nullptr;
- }
-
- void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
-     delete adapter;
- }
+ #include "llama-adapter.h"
+
+ #include "llama-impl.h"
+ #include "llama-mmap.h"
+ #include "llama-model.h"
+
+ #include <algorithm>
+ #include <map>
+ #include <cassert>
+ #include <stdexcept>
+
+ // vec
+
+ struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+         return nullptr;
+     }
+
+     return tensors[il];
+ }
+
+ struct lm_ggml_tensor * llama_adapter_cvec::apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const {
+     lm_ggml_tensor * layer_dir = tensor_for(il);
+     if (layer_dir != nullptr) {
+         cur = lm_ggml_add(ctx, cur, layer_dir);
+     }
+
+     return cur;
+ }
+
+ bool llama_adapter_cvec::init(const llama_model & model) {
+     const auto & hparams = model.hparams;
+
+     LM_GGML_ASSERT(tensors.empty());
+     LM_GGML_ASSERT(ctxs.empty());
+     LM_GGML_ASSERT(bufs.empty());
+
+     // create a context for each buffer type
+     std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+         auto it = ctx_map.find(buft);
+         if (it == ctx_map.end()) {
+             struct lm_ggml_init_params params = {
+                 /*.mem_size =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
+                 /*.mem_buffer =*/ NULL,
+                 /*.no_alloc =*/ true,
+             };
+
+             lm_ggml_context * ctx = lm_ggml_init(params);
+             if (!ctx) {
+                 return nullptr;
+             }
+
+             ctx_map[buft] = ctx;
+             ctxs.emplace_back(ctx);
+
+             return ctx;
+         }
+
+         return it->second;
+     };
+
+     // make tensors
+     tensors.reserve(hparams.n_layer);
+     tensors.push_back(nullptr); // there's never a tensor for layer 0
+     for (size_t il = 1; il < hparams.n_layer; il++) {
+         lm_ggml_backend_buffer_type_t buft = model.select_buft(il);
+         lm_ggml_context * ctx = ctx_for_buft(buft);
+         if (!ctx) {
+             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+             return false;
+         }
+         lm_ggml_tensor * tensor = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, hparams.n_embd);
+         tensors.push_back(tensor);
+     }
+
+     // allocate tensors / buffers and zero
+     bufs.reserve(ctx_map.size());
+     for (auto it : ctx_map) {
+         lm_ggml_backend_buffer_type_t buft = it.first;
+         lm_ggml_context * ctx = it.second;
+         lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+         if (!buf) {
+             LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+             return false;
+         }
+         lm_ggml_backend_buffer_clear(buf, 0);
+         bufs.emplace_back(buf);
+     }
+
+     return true;
+ }
+
+ int32_t llama_adapter_cvec::apply(
+         const llama_model & model,
+         const float * data,
+         size_t len,
+         int32_t n_embd,
+         int32_t il_start,
+         int32_t il_end) {
+     const auto & hparams = model.hparams;
+
+     if (data == nullptr) {
+         // disable the current control vector (but leave allocated for later)
+         layer_start = -1;
+         layer_end = -1;
+         return 0;
+     }
+
+     if (n_embd != (int) hparams.n_embd) {
+         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+         return 1;
+     }
+
+     if (tensors.empty()) {
+         if (!init(model)) {
+             return 1;
+         }
+     }
+
+     layer_start = il_start;
+     layer_end = il_end;
+
+     for (size_t il = 1; il < hparams.n_layer; il++) {
+         assert(tensors[il] != nullptr);
+
+         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+         if (off + n_embd <= len) {
+             lm_ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * lm_ggml_element_size(tensors[il]));
+         }
+     }
+
+     return 0;
+ }
+
+ // lora
+
+ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor * w) {
+     const std::string name(w->name);
+
+     const auto pos = ab_map.find(name);
+     if (pos != ab_map.end()) {
+         return &pos->second;
+     }
+
+     return nullptr;
+ }
+
+ static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+     lm_ggml_context * ctx_init;
+     struct lm_gguf_init_params meta_lm_gguf_params = {
+         /* .no_alloc = */ true,
+         /* .ctx = */ &ctx_init,
+     };
+
+     lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(path_lora, meta_lm_gguf_params) };
+     if (!ctx_gguf) {
+         throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+     }
+
+     lm_ggml_context_ptr ctx { ctx_init };
+
+     // check metadata
+     {
+         auto get_kv_str = [&](const std::string & key) -> std::string {
+             int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
+             return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf.get(), id));
+         };
+         auto get_kv_f32 = [&](const std::string & key) -> float {
+             int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
+             return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf.get(), id);
+         };
+         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+         auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+         if (general_type != "adapter") {
+             throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+         }
+
+         auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+         auto general_arch = llm_arch_from_string(general_arch_str);
+         if (general_arch != model.arch) {
+             throw std::runtime_error("model arch and LoRA arch mismatch");
+         }
+
+         auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+         if (adapter_type != "lora") {
+             throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+         }
+
+         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+     }
+
+     int n_tensors = lm_gguf_get_n_tensors(ctx_gguf.get());
+
+     // contexts for each buffer type
+     std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+         auto it = ctx_map.find(buft);
+         if (it == ctx_map.end()) {
+             // add a new context
+             struct lm_ggml_init_params params = {
+                 /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
+                 /*.mem_buffer =*/ NULL,
+                 /*.no_alloc =*/ true,
+             };
+             lm_ggml_context * buft_ctx = lm_ggml_init(params);
+             if (!buft_ctx) {
+                 return nullptr;
+             }
+             ctx_map[buft] = buft_ctx;
+             adapter.ctxs.emplace_back(buft_ctx);
+             return buft_ctx;
+         };
+         return it->second;
+     };
+
+     // bundle lora_a and lora_b into pairs
+     std::map<std::string, llama_adapter_lora_weight> ab_map;
+     auto str_endswith = [](const std::string & str, const std::string & suffix) {
+         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+     };
+
+     for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx.get()); cur; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
+         std::string name(cur->name);
+         if (str_endswith(name, ".lora_a")) {
+             replace_all(name, ".lora_a", "");
+             if (ab_map.find(name) == ab_map.end()) {
+                 ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
+             } else {
+                 ab_map[name].a = cur;
+             }
+         } else if (str_endswith(name, ".lora_b")) {
+             replace_all(name, ".lora_b", "");
+             if (ab_map.find(name) == ab_map.end()) {
+                 ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
+             } else {
+                 ab_map[name].b = cur;
+             }
+         } else if (str_endswith(name, "_norm.weight")) {
+             // TODO: add support for norm vector
+             // for now, we don't really care because most adapters still work fine without it
+             continue;
+         } else {
+             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+         }
+     }
+
+     // add tensors
+     for (auto & it : ab_map) {
+         const std::string & name = it.first;
+         llama_adapter_lora_weight & w = it.second;
+         bool is_token_embd = str_endswith(name, "token_embd.weight");
+
+         if (!w.a || !w.b) {
+             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+         }
+
+         // device buft and device ctx
+         const auto * model_tensor = model.get_tensor(name.c_str());
+         if (!model_tensor) {
+             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
+         }
+
+         struct lm_ggml_context * dev_ctx = ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
+         // validate tensor shape
+         if (is_token_embd) {
+             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+             if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                 throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+             }
+         } else {
+             if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                 throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+             }
+             if (w.a->ne[1] != w.b->ne[0]) {
+                 throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+             }
+         }
+
+         // save tensor to adapter
+         struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+         struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
+         lm_ggml_set_name(tensor_a, w.a->name);
+         lm_ggml_set_name(tensor_b, w.b->name);
+         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
+     }
+
+     // allocate tensors / buffers and zero
+     {
+         adapter.ctxs.reserve(ctx_map.size());
+         adapter.bufs.reserve(ctx_map.size());
+         for (auto & it : ctx_map) {
+             lm_ggml_backend_buffer_type_t buft = it.first;
+             lm_ggml_context * ctx_dev = it.second;
+             lm_ggml_backend_buffer_ptr buf { lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+             if (!buf) {
+                 throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+             }
+             LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+             adapter.bufs.emplace_back(std::move(buf));
+         }
+     }
+
+     // set tensor data
+     {
+         llama_file lm_gguf_file(path_lora, "rb");
+         std::vector<uint8_t> read_buf;
+         auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
+             size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
+             size_t size = lm_ggml_nbytes(orig);
+             read_buf.resize(size);
+             lm_gguf_file.seek(offs, SEEK_SET);
+             lm_gguf_file.read_raw(read_buf.data(), size);
+             lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+         };
+         for (auto & it : adapter.ab_map) {
+             auto orig = ab_map[it.first];
+             auto dev = it.second;
+             set_tensor(orig.a, dev.a);
+             set_tensor(orig.b, dev.b);
+         }
+     }
+
+     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+ }
+
+ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+     struct llama_adapter_lora * adapter = new llama_adapter_lora();
+
+     try {
+         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+         return adapter;
+     } catch (const std::exception & err) {
+         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+         delete adapter;
+     }
+
+     return nullptr;
+ }
+
+ void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+     delete adapter;
+ }
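
Note (illustration, not part of the published package diff): the hunk above defines the LoRA adapter entry points llama_adapter_lora_init and llama_adapter_lora_free. A minimal usage sketch follows, assuming a model loaded through the regular llama.cpp loader bundled with this package; llama_model_default_params, llama_model_load_from_file, llama_model_free and the file paths are assumptions, not shown in this diff.

    #include "llama.h"

    int main() {
        // Assumed loader entry points from the bundled llama.cpp; paths are hypothetical.
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (!model) {
            return 1;
        }

        // llama_adapter_lora_init (defined in the hunk above) returns nullptr and logs on failure.
        llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
        if (adapter) {
            // ... attach the adapter to a llama_context before decoding ...
            llama_adapter_lora_free(adapter);
        }

        llama_model_free(model);
        return 0;
    }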