cui-llama.rn 1.4.6 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +52 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1779
  14. package/cpp/chat.h +9 -1
  15. package/cpp/common.cpp +20 -522
  16. package/cpp/common.h +13 -36
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-common.h +12 -6
  19. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  20. package/cpp/ggml-cpu-impl.h +2 -21
  21. package/cpp/ggml-cpu-quants.c +904 -405
  22. package/cpp/ggml-cpu.c +909 -13237
  23. package/cpp/ggml-impl.h +50 -23
  24. package/cpp/ggml-metal-impl.h +77 -3
  25. package/cpp/ggml-metal.m +794 -580
  26. package/cpp/ggml.c +92 -3
  27. package/cpp/ggml.h +29 -5
  28. package/cpp/gguf.cpp +1 -0
  29. package/cpp/llama-adapter.cpp +55 -20
  30. package/cpp/llama-adapter.h +11 -9
  31. package/cpp/llama-arch.cpp +217 -16
  32. package/cpp/llama-arch.h +25 -0
  33. package/cpp/llama-batch.h +2 -2
  34. package/cpp/llama-chat.cpp +54 -2
  35. package/cpp/llama-chat.h +3 -0
  36. package/cpp/llama-context.cpp +2294 -1238
  37. package/cpp/llama-context.h +214 -77
  38. package/cpp/llama-cparams.h +1 -0
  39. package/cpp/llama-graph.cpp +1695 -0
  40. package/cpp/llama-graph.h +592 -0
  41. package/cpp/llama-hparams.cpp +8 -0
  42. package/cpp/llama-hparams.h +17 -0
  43. package/cpp/llama-io.cpp +15 -0
  44. package/cpp/llama-io.h +35 -0
  45. package/cpp/llama-kv-cache.cpp +965 -303
  46. package/cpp/llama-kv-cache.h +145 -151
  47. package/cpp/llama-memory.cpp +1 -0
  48. package/cpp/llama-memory.h +21 -0
  49. package/cpp/llama-mmap.cpp +1 -1
  50. package/cpp/llama-model-loader.cpp +10 -5
  51. package/cpp/llama-model-loader.h +5 -3
  52. package/cpp/llama-model.cpp +9194 -201
  53. package/cpp/llama-model.h +40 -1
  54. package/cpp/llama-sampling.cpp +5 -0
  55. package/cpp/llama-vocab.cpp +36 -5
  56. package/cpp/llama.cpp +51 -9984
  57. package/cpp/llama.h +102 -22
  58. package/cpp/log.cpp +34 -0
  59. package/cpp/minja/chat-template.hpp +15 -7
  60. package/cpp/minja/minja.hpp +120 -94
  61. package/cpp/ops.cpp +8723 -0
  62. package/cpp/ops.h +128 -0
  63. package/cpp/rn-llama.cpp +44 -53
  64. package/cpp/rn-llama.h +2 -12
  65. package/cpp/sampling.cpp +3 -0
  66. package/cpp/sgemm.cpp +533 -88
  67. package/cpp/simd-mappings.h +888 -0
  68. package/cpp/speculative.cpp +4 -4
  69. package/cpp/unary-ops.cpp +186 -0
  70. package/cpp/unary-ops.h +28 -0
  71. package/cpp/vec.cpp +258 -0
  72. package/cpp/vec.h +802 -0
  73. package/ios/CMakeLists.txt +5 -2
  74. package/ios/RNLlama.mm +2 -2
  75. package/ios/RNLlamaContext.mm +40 -24
  76. package/package.json +1 -1
  77. package/src/NativeRNLlama.ts +6 -4
  78. package/src/index.ts +3 -1
  79. package/cpp/chat-template.hpp +0 -529
  80. package/cpp/minja.hpp +0 -2915
package/cpp/ggml.c CHANGED
@@ -942,6 +942,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
  "RMS_NORM",
  "RMS_NORM_BACK",
  "GROUP_NORM",
+ "L2_NORM",

  "MUL_MAT",
  "MUL_MAT_ID",
@@ -990,6 +991,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
  "ADD_REL_POS",
  "RWKV_WKV6",
  "GATED_LINEAR_ATTN",
+ "RWKV_WKV7",

  "UNARY",

@@ -1009,7 +1011,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
  "OPT_STEP_ADAMW",
  };

- static_assert(LM_GGML_OP_COUNT == 83, "LM_GGML_OP_COUNT != 83");
+ static_assert(LM_GGML_OP_COUNT == 85, "LM_GGML_OP_COUNT != 85");

  static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "none",
@@ -1039,6 +1041,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "rms_norm(x)",
  "rms_norm_back(x)",
  "group_norm(x)",
+ "l2_norm(x)",

  "X*Y",
  "X[i]*Y",
@@ -1087,6 +1090,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "add_rel_pos(x)",
  "rwkv_wkv6(k, v, r, tf, td, s)",
  "gated_linear_attn(k, v, q, gate, s)",
+ "rwkv_wkv7(r, w, k, v, a, b, s)",

  "unary(x)",

@@ -1106,7 +1110,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
  "adamw(x)",
  };

- static_assert(LM_GGML_OP_COUNT == 83, "LM_GGML_OP_COUNT != 83");
+ static_assert(LM_GGML_OP_COUNT == 85, "LM_GGML_OP_COUNT != 85");

  static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");

@@ -1168,6 +1172,12 @@ int64_t lm_ggml_nrows(const struct lm_ggml_tensor * tensor) {
  }

  size_t lm_ggml_nbytes(const struct lm_ggml_tensor * tensor) {
+ for (int i = 0; i < LM_GGML_MAX_DIMS; ++i) {
+ if (tensor->ne[i] <= 0) {
+ return 0;
+ }
+ }
+
  size_t nbytes;
  const size_t blck_size = lm_ggml_blck_size(tensor->type);
  if (blck_size == 1) {
@@ -2699,6 +2709,37 @@ struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
  return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
  }

+ // lm_ggml_l2_norm
+
+ static struct lm_ggml_tensor * lm_ggml_l2_norm_impl(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps,
+ bool inplace) {
+ struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
+
+ lm_ggml_set_op_params_f32(result, 0, eps);
+
+ result->op = LM_GGML_OP_L2_NORM;
+ result->src[0] = a;
+
+ return result;
+ }
+
+ struct lm_ggml_tensor * lm_ggml_l2_norm(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps) {
+ return lm_ggml_l2_norm_impl(ctx, a, eps, false);
+ }
+
+ struct lm_ggml_tensor * lm_ggml_l2_norm_inplace(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps) {
+ return lm_ggml_l2_norm_impl(ctx, a, eps, true);
+ }
+
  // lm_ggml_mul_mat

  static inline bool lm_ggml_can_mul_mat(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1) {
@@ -4347,7 +4388,7 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
  }

  // permute(0, 2, 1, 3)
- int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+ int64_t ne[4] = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] };
  struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);

  float params[] = { scale, max_bias, logit_softcap };
@@ -4733,6 +4774,54 @@ struct lm_ggml_tensor * lm_ggml_gated_linear_attn(
  return result;
  }

+ // lm_ggml_rwkv_wkv7
+
+ struct lm_ggml_tensor * lm_ggml_rwkv_wkv7(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * r,
+ struct lm_ggml_tensor * w,
+ struct lm_ggml_tensor * k,
+ struct lm_ggml_tensor * v,
+ struct lm_ggml_tensor * a,
+ struct lm_ggml_tensor * b,
+ struct lm_ggml_tensor * state) {
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(r));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(w));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(k));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(v));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(a));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(b));
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(state));
+
+ const int64_t S = k->ne[0];
+ const int64_t H = k->ne[1];
+ const int64_t n_tokens = k->ne[2];
+ const int64_t n_seqs = state->ne[1];
+ {
+ LM_GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
+ LM_GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
+ LM_GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+ LM_GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
+ LM_GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
+ LM_GGML_ASSERT(lm_ggml_nelements(state) == S * S * H * n_seqs);
+ }
+
+ // concat output and new_state
+ const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
+
+ result->op = LM_GGML_OP_RWKV_WKV7;
+ result->src[0] = r;
+ result->src[1] = w;
+ result->src[2] = k;
+ result->src[3] = v;
+ result->src[4] = a;
+ result->src[5] = b;
+ result->src[6] = state;
+
+ return result;
+ }
+
  // lm_ggml_unary

  static struct lm_ggml_tensor * lm_ggml_unary_impl(
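The assertions above pin down the shapes the new op expects: r/w/k/v/a/b are [S, H, n_tokens] and the recurrent state holds S*S*H elements per sequence, with the per-token output and the updated state packed into one [S*H, n_tokens + S*n_seqs] result. A minimal usage sketch follows; it is illustrative only and not part of this package — the context setup, the example sizes, and the tensor names are assumptions, while the call itself follows the signature added above.

// Illustrative sketch (this fork's prefixed C API): drive the new wkv7 op with the asserted shapes.
struct lm_ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
struct lm_ggml_context * ctx = lm_ggml_init(ip);

const int64_t S = 64, H = 8, n_tokens = 4, n_seqs = 1;   // example sizes, not from the diff
struct lm_ggml_tensor * r = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * w = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * k = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * v = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * a = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * b = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
struct lm_ggml_tensor * state = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, S * S * H, n_seqs);

// rows 0..n_tokens-1 of the result hold the per-token output; the remaining S*n_seqs rows hold the new state
struct lm_ggml_tensor * out = lm_ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, state);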
package/cpp/ggml.h CHANGED
@@ -455,6 +455,7 @@ extern "C" {
  LM_GGML_OP_RMS_NORM,
  LM_GGML_OP_RMS_NORM_BACK,
  LM_GGML_OP_GROUP_NORM,
+ LM_GGML_OP_L2_NORM,

  LM_GGML_OP_MUL_MAT,
  LM_GGML_OP_MUL_MAT_ID,
@@ -503,6 +504,7 @@ extern "C" {
  LM_GGML_OP_ADD_REL_POS,
  LM_GGML_OP_RWKV_WKV6,
  LM_GGML_OP_GATED_LINEAR_ATTN,
+ LM_GGML_OP_RWKV_WKV7,

  LM_GGML_OP_UNARY,

@@ -1096,6 +1098,18 @@ extern "C" {
  int n_groups,
  float eps);

+ // l2 normalize along rows
+ // used in rwkv v7
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_l2_norm(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps);
+
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_l2_norm_inplace(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ float eps);
+
  // a - x
  // b - dy
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rms_norm_back(
@@ -1778,11 +1792,11 @@ extern "C" {

  #define LM_GGML_KQ_MASK_PAD 64

- // q: [n_embd, n_batch, n_head, 1]
- // k: [n_embd, n_kv, n_head_kv, 1]
- // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
- // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = LM_GGML_PAD(n_batch, LM_GGML_KQ_MASK_PAD) !!
- // res: [n_embd, n_head, n_batch, 1] !! permuted !!
+ // q: [n_embd_k, n_batch, n_head, 1]
+ // k: [n_embd_k, n_kv, n_head_kv, 1]
+ // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = LM_GGML_PAD(n_batch, LM_GGML_KQ_MASK_PAD) !!
+ // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * q,
@@ -1891,6 +1905,16 @@ extern "C" {
  struct lm_ggml_tensor * state,
  float scale);

+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_rwkv_wkv7(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * r,
+ struct lm_ggml_tensor * w,
+ struct lm_ggml_tensor * k,
+ struct lm_ggml_tensor * v,
+ struct lm_ggml_tensor * a,
+ struct lm_ggml_tensor * b,
+ struct lm_ggml_tensor * state);
+
  // custom operators

  typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
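The l2_norm declarations above normalize along rows, as the header comment notes (used by RWKV v7). A minimal, illustrative call — not part of the package; ctx, the tensor sizes, and the eps value are assumptions:

// Illustrative sketch: L2-normalize each 64-element row of an 8-row activation tensor.
struct lm_ggml_tensor * x  = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, 64, 8);
struct lm_ggml_tensor * xn = lm_ggml_l2_norm(ctx, x, 1e-12f);          // result in a new tensor
struct lm_ggml_tensor * xi = lm_ggml_l2_norm_inplace(ctx, x, 1e-12f);  // writes back into x's data

As with the other norm ops, eps is presumably only there to guard against division by zero on (near-)zero rows.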
package/cpp/gguf.cpp CHANGED
@@ -932,6 +932,7 @@ static void lm_gguf_check_reserved_keys(const std::string & key, const T val) {
  if constexpr (std::is_same<T, uint32_t>::value) {
  LM_GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && LM_GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
  } else {
+ LM_GGML_UNUSED(val);
  LM_GGML_ABORT(LM_GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
  }
  }
package/cpp/llama-adapter.cpp CHANGED
@@ -4,14 +4,13 @@
  #include "llama-mmap.h"
  #include "llama-model.h"

- #include <algorithm>
  #include <map>
  #include <cassert>
  #include <stdexcept>

  // vec

- struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
  if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
  return nullptr;
  }
@@ -19,7 +18,7 @@ struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
  return tensors[il];
  }

- struct lm_ggml_tensor * llama_adapter_cvec::apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const {
+ lm_ggml_tensor * llama_adapter_cvec::apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const {
  lm_ggml_tensor * layer_dir = tensor_for(il);
  if (layer_dir != nullptr) {
  cur = lm_ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
  auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
- struct lm_ggml_init_params params = {
+ lm_ggml_init_params params = {
  /*.mem_size =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
  return true;
  }

- int32_t llama_adapter_cvec::apply(
+ bool llama_adapter_cvec::apply(
  const llama_model & model,
  const float * data,
  size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
  // disable the current control vector (but leave allocated for later)
  layer_start = -1;
  layer_end = -1;
- return 0;
+ return true;
  }

  if (n_embd != (int) hparams.n_embd) {
  LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
- return 1;
+ return false;
  }

  if (tensors.empty()) {
  if (!init(model)) {
- return 1;
+ return false;
  }
  }

@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
  }
  }

- return 0;
+ return true;
  }

  // lora

- llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor * w) {
+ llama_adapter_lora_weight * llama_adapter_lora::get_weight(lm_ggml_tensor * w) {
  const std::string name(w->name);

  const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor
  return nullptr;
  }

- static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
  LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

  lm_ggml_context * ctx_init;
- struct lm_gguf_init_params meta_lm_gguf_params = {
+ lm_gguf_init_params meta_lm_gguf_params = {
  /* .no_alloc = */ true,
  /* .ctx = */ &ctx_init,
  };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
  // add a new context
- struct lm_ggml_init_params params = {
+ lm_ggml_init_params params = {
  /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  }
  }

+ // get extra buffer types of the CPU
+ // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+ // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+ std::vector<lm_ggml_backend_buffer_type_t> buft_extra;
+ {
+ auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
+
+ auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
+ lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
+
+ if (lm_ggml_backend_dev_get_extra_bufts_fn) {
+ lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_extra.emplace_back(*extra_bufts);
+ ++extra_bufts;
+ }
+ }
+ }
+
  // add tensors
  for (auto & it : ab_map) {
  const std::string & name = it.first;
@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
  }

- struct lm_ggml_context * dev_ctx = ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
+ auto * buft = lm_ggml_backend_buffer_get_type(model_tensor->buffer);
+
+ // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+ for (auto & ex : buft_extra) {
+ if (ex == buft) {
+ LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+ auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
+
+ break;
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+ lm_ggml_context * dev_ctx = ctx_for_buft(buft);
  // validate tensor shape
  if (is_token_embd) {
  // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  }

  // save tensor to adapter
- struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
- struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
+ lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+ lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
  lm_ggml_set_name(tensor_a, w.a->name);
  lm_ggml_set_name(tensor_b, w.b->name);
  adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  {
  llama_file lm_gguf_file(path_lora, "rb");
  std::vector<uint8_t> read_buf;
- auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
+ auto set_tensor = [&](lm_ggml_tensor * orig, lm_ggml_tensor * dev) {
  size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
  size_t size = lm_ggml_nbytes(orig);
  read_buf.resize(size);
@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
  }

- struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
- struct llama_adapter_lora * adapter = new llama_adapter_lora();
+ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+ llama_adapter_lora * adapter = new llama_adapter_lora();

  try {
  llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
  return nullptr;
  }

- void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+ void llama_adapter_lora_free(llama_adapter_lora * adapter) {
  delete adapter;
  }
package/cpp/llama-adapter.h CHANGED
@@ -15,11 +15,11 @@
  //

  struct llama_adapter_cvec {
- struct lm_ggml_tensor * tensor_for(int il) const;
+ lm_ggml_tensor * tensor_for(int il) const;

- struct lm_ggml_tensor * apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const;
+ lm_ggml_tensor * apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const;

- int32_t apply(
+ bool apply(
  const llama_model & model,
  const float * data,
  size_t len,
@@ -36,7 +36,7 @@ private:
  std::vector<lm_ggml_context_ptr> ctxs;
  std::vector<lm_ggml_backend_buffer_ptr> bufs;

- std::vector<struct lm_ggml_tensor *> tensors; // per layer
+ std::vector<lm_ggml_tensor *> tensors; // per layer
  };

  //
@@ -44,8 +44,8 @@ private:
  //

  struct llama_adapter_lora_weight {
- struct lm_ggml_tensor * a = nullptr;
- struct lm_ggml_tensor * b = nullptr;
+ lm_ggml_tensor * a = nullptr;
+ lm_ggml_tensor * b = nullptr;

  // get actual scale based on rank and alpha
  float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
  }

  llama_adapter_lora_weight() = default;
- llama_adapter_lora_weight(struct lm_ggml_tensor * a, struct lm_ggml_tensor * b) : a(a), b(b) {}
+ llama_adapter_lora_weight(lm_ggml_tensor * a, lm_ggml_tensor * b) : a(a), b(b) {}
  };

  struct llama_adapter_lora {
  // map tensor name to lora_a_b
- std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+ std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

  std::vector<lm_ggml_context_ptr> ctxs;
  std::vector<lm_ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
  llama_adapter_lora() = default;
  ~llama_adapter_lora() = default;

- llama_adapter_lora_weight * get_weight(struct lm_ggml_tensor * w);
+ llama_adapter_lora_weight * get_weight(lm_ggml_tensor * w);
  };
+
+ using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
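Besides switching llama_adapter_cvec::apply to a bool result, the header adds the llama_adapter_loras alias. A short, hypothetical caller sketch — not from the package; `model` is assumed to be an already-loaded llama_model *, the adapter path is a placeholder, and the mapped float is assumed to be the per-adapter scale:

// Hypothetical sketch: load one LoRA with the public API above and register it with a scale.
llama_adapter_lora * adapter = llama_adapter_lora_init(model, "/path/to/lora.gguf");
if (adapter != nullptr) {
    llama_adapter_loras loras;      // std::unordered_map<llama_adapter_lora *, float>
    loras[adapter] = 0.5f;          // assumed: the value is the blending scale
    // ... hand the map to the context / graph-building code ...
    llama_adapter_lora_free(adapter);
}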