cui-llama.rn 1.4.6 → 1.5.0
This diff compares the contents of the two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/android/src/main/CMakeLists.txt +9 -2
- package/android/src/main/jni.cpp +52 -34
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/binary-ops.cpp +158 -0
- package/cpp/binary-ops.h +16 -0
- package/cpp/chat.cpp +1769 -1779
- package/cpp/chat.h +9 -1
- package/cpp/common.cpp +20 -522
- package/cpp/common.h +13 -36
- package/cpp/cpu-common.h +72 -0
- package/cpp/ggml-common.h +12 -6
- package/cpp/ggml-cpu-aarch64.cpp +1557 -80
- package/cpp/ggml-cpu-impl.h +2 -21
- package/cpp/ggml-cpu-quants.c +904 -405
- package/cpp/ggml-cpu.c +909 -13237
- package/cpp/ggml-impl.h +50 -23
- package/cpp/ggml-metal-impl.h +77 -3
- package/cpp/ggml-metal.m +794 -580
- package/cpp/ggml.c +92 -3
- package/cpp/ggml.h +29 -5
- package/cpp/gguf.cpp +1 -0
- package/cpp/llama-adapter.cpp +55 -20
- package/cpp/llama-adapter.h +11 -9
- package/cpp/llama-arch.cpp +217 -16
- package/cpp/llama-arch.h +25 -0
- package/cpp/llama-batch.h +2 -2
- package/cpp/llama-chat.cpp +54 -2
- package/cpp/llama-chat.h +3 -0
- package/cpp/llama-context.cpp +2294 -1238
- package/cpp/llama-context.h +214 -77
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +1695 -0
- package/cpp/llama-graph.h +592 -0
- package/cpp/llama-hparams.cpp +8 -0
- package/cpp/llama-hparams.h +17 -0
- package/cpp/llama-io.cpp +15 -0
- package/cpp/llama-io.h +35 -0
- package/cpp/llama-kv-cache.cpp +965 -303
- package/cpp/llama-kv-cache.h +145 -151
- package/cpp/llama-memory.cpp +1 -0
- package/cpp/llama-memory.h +21 -0
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +10 -5
- package/cpp/llama-model-loader.h +5 -3
- package/cpp/llama-model.cpp +9194 -201
- package/cpp/llama-model.h +40 -1
- package/cpp/llama-sampling.cpp +5 -0
- package/cpp/llama-vocab.cpp +36 -5
- package/cpp/llama.cpp +51 -9984
- package/cpp/llama.h +102 -22
- package/cpp/log.cpp +34 -0
- package/cpp/minja/chat-template.hpp +15 -7
- package/cpp/minja/minja.hpp +120 -94
- package/cpp/ops.cpp +8723 -0
- package/cpp/ops.h +128 -0
- package/cpp/rn-llama.cpp +44 -53
- package/cpp/rn-llama.h +2 -12
- package/cpp/sampling.cpp +3 -0
- package/cpp/sgemm.cpp +533 -88
- package/cpp/simd-mappings.h +888 -0
- package/cpp/speculative.cpp +4 -4
- package/cpp/unary-ops.cpp +186 -0
- package/cpp/unary-ops.h +28 -0
- package/cpp/vec.cpp +258 -0
- package/cpp/vec.h +802 -0
- package/ios/CMakeLists.txt +5 -2
- package/ios/RNLlama.mm +2 -2
- package/ios/RNLlamaContext.mm +40 -24
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +6 -4
- package/src/index.ts +3 -1
- package/cpp/chat-template.hpp +0 -529
- package/cpp/minja.hpp +0 -2915
package/cpp/ggml.c
CHANGED
@@ -942,6 +942,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
     "RMS_NORM",
     "RMS_NORM_BACK",
     "GROUP_NORM",
+    "L2_NORM",

     "MUL_MAT",
     "MUL_MAT_ID",
@@ -990,6 +991,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
     "ADD_REL_POS",
     "RWKV_WKV6",
     "GATED_LINEAR_ATTN",
+    "RWKV_WKV7",

     "UNARY",

@@ -1009,7 +1011,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };

-static_assert(LM_GGML_OP_COUNT ==
+static_assert(LM_GGML_OP_COUNT == 85, "LM_GGML_OP_COUNT != 85");

 static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "none",
@@ -1039,6 +1041,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "rms_norm(x)",
     "rms_norm_back(x)",
     "group_norm(x)",
+    "l2_norm(x)",

     "X*Y",
     "X[i]*Y",
@@ -1087,6 +1090,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
     "gated_linear_attn(k, v, q, gate, s)",
+    "rwkv_wkv7(r, w, k, v, a, b, s)",

     "unary(x)",

@@ -1106,7 +1110,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "adamw(x)",
 };

-static_assert(LM_GGML_OP_COUNT ==
+static_assert(LM_GGML_OP_COUNT == 85, "LM_GGML_OP_COUNT != 85");

 static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");

@@ -1168,6 +1172,12 @@ int64_t lm_ggml_nrows(const struct lm_ggml_tensor * tensor) {
 }

 size_t lm_ggml_nbytes(const struct lm_ggml_tensor * tensor) {
+    for (int i = 0; i < LM_GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] <= 0) {
+            return 0;
+        }
+    }
+
     size_t nbytes;
     const size_t blck_size = lm_ggml_blck_size(tensor->type);
     if (blck_size == 1) {
@@ -2699,6 +2709,37 @@ struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
     return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }

+// lm_ggml_l2_norm
+
+static struct lm_ggml_tensor * lm_ggml_l2_norm_impl(
+        struct lm_ggml_context * ctx,
+        struct lm_ggml_tensor * a,
+        float eps,
+        bool inplace) {
+    struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
+
+    lm_ggml_set_op_params_f32(result, 0, eps);
+
+    result->op     = LM_GGML_OP_L2_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct lm_ggml_tensor * lm_ggml_l2_norm(
+        struct lm_ggml_context * ctx,
+        struct lm_ggml_tensor * a,
+        float eps) {
+    return lm_ggml_l2_norm_impl(ctx, a, eps, false);
+}
+
+struct lm_ggml_tensor * lm_ggml_l2_norm_inplace(
+        struct lm_ggml_context * ctx,
+        struct lm_ggml_tensor * a,
+        float eps) {
+    return lm_ggml_l2_norm_impl(ctx, a, eps, true);
+}
+
 // lm_ggml_mul_mat

 static inline bool lm_ggml_can_mul_mat(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1) {
@@ -4347,7 +4388,7 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
     }

     // permute(0, 2, 1, 3)
-    int64_t ne[4] = {
+    int64_t ne[4] = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);

     float params[] = { scale, max_bias, logit_softcap };
@@ -4733,6 +4774,54 @@ struct lm_ggml_tensor * lm_ggml_gated_linear_attn(
     return result;
 }

+// lm_ggml_rwkv_wkv7
+
+struct lm_ggml_tensor * lm_ggml_rwkv_wkv7(
+        struct lm_ggml_context * ctx,
+        struct lm_ggml_tensor * r,
+        struct lm_ggml_tensor * w,
+        struct lm_ggml_tensor * k,
+        struct lm_ggml_tensor * v,
+        struct lm_ggml_tensor * a,
+        struct lm_ggml_tensor * b,
+        struct lm_ggml_tensor * state) {
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(r));
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(w));
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(k));
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(v));
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(a));
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(b));
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        LM_GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
+        LM_GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
+        LM_GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        LM_GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
+        LM_GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
+        LM_GGML_ASSERT(lm_ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);
+
+    result->op     = LM_GGML_OP_RWKV_WKV7;
+    result->src[0] = r;
+    result->src[1] = w;
+    result->src[2] = k;
+    result->src[3] = v;
+    result->src[4] = a;
+    result->src[5] = b;
+    result->src[6] = state;
+
+    return result;
+}
+
 // lm_ggml_unary

 static struct lm_ggml_tensor * lm_ggml_unary_impl(
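Note: the new LM_GGML_OP_L2_NORM op normalizes each row of a tensor to unit L2 length and is consumed by the RWKV v7 graph code added elsewhere in this release. Below is a minimal sketch of driving the new API from host code; it assumes the lm_ggml_-prefixed entry points (lm_ggml_init, lm_ggml_new_tensor_2d, lm_ggml_new_graph, lm_ggml_build_forward_expand, lm_ggml_free) behave like their upstream ggml counterparts, and it is not taken from the package itself.

```cpp
// Sketch only: build a graph that L2-normalizes each row of a 2-D tensor.
#include "ggml.h"

static void l2_norm_demo(void) {
    struct lm_ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct lm_ggml_context * ctx = lm_ggml_init(params);

    // 4 rows of 8 elements; each row is normalized independently
    struct lm_ggml_tensor * x = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, 8, 4);
    struct lm_ggml_tensor * y = lm_ggml_l2_norm(ctx, x, 1e-12f);

    struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx);
    lm_ggml_build_forward_expand(gf, y);

    // fill x->data here, then compute gf through the CPU backend as usual

    lm_ggml_free(ctx);
}
```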
package/cpp/ggml.h
CHANGED
@@ -455,6 +455,7 @@ extern "C" {
     LM_GGML_OP_RMS_NORM,
     LM_GGML_OP_RMS_NORM_BACK,
     LM_GGML_OP_GROUP_NORM,
+    LM_GGML_OP_L2_NORM,

     LM_GGML_OP_MUL_MAT,
     LM_GGML_OP_MUL_MAT_ID,
@@ -503,6 +504,7 @@ extern "C" {
     LM_GGML_OP_ADD_REL_POS,
     LM_GGML_OP_RWKV_WKV6,
     LM_GGML_OP_GATED_LINEAR_ATTN,
+    LM_GGML_OP_RWKV_WKV7,

     LM_GGML_OP_UNARY,

@@ -1096,6 +1098,18 @@ extern "C" {
             int n_groups,
             float eps);

+    // l2 normalize along rows
+    // used in rwkv v7
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_l2_norm(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor * a,
+            float eps);
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_l2_norm_inplace(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor * a,
+            float eps);
+
     // a - x
     // b - dy
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rms_norm_back(
@@ -1778,11 +1792,11 @@ extern "C" {

 #define LM_GGML_KQ_MASK_PAD 64

-    // q: [
-    // k: [
-    // v: [
-    // mask: [n_kv,
-    // res: [
+    // q: [n_embd_k, n_batch, n_head, 1]
+    // k: [n_embd_k, n_kv, n_head_kv, 1]
+    // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = LM_GGML_PAD(n_batch, LM_GGML_KQ_MASK_PAD) !!
+    // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * q,
@@ -1891,6 +1905,16 @@ extern "C" {
             struct lm_ggml_tensor * state,
             float scale);

+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_rwkv_wkv7(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor * r,
+            struct lm_ggml_tensor * w,
+            struct lm_ggml_tensor * k,
+            struct lm_ggml_tensor * v,
+            struct lm_ggml_tensor * a,
+            struct lm_ggml_tensor * b,
+            struct lm_ggml_tensor * state);
+
     // custom operators

     typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
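Note: the shape contract for lm_ggml_rwkv_wkv7 follows from the asserts added in ggml.c above: r/w/k/v/a/b are [S, H, n_tokens] per-head projections, state holds S*S*H elements per sequence, and the result packs the per-token output and the updated state into one [S*H, n_tokens + S*n_seqs] tensor. The sketch below only illustrates constructing operands with those shapes; the sizes and the helper name are illustrative, not package code.

```cpp
// Illustrative shapes only; S = head size, H = number of heads.
#include "ggml.h"

static struct lm_ggml_tensor * build_wkv7_demo(struct lm_ggml_context * ctx) {
    const int64_t S = 64, H = 8, n_tokens = 16, n_seqs = 1;

    struct lm_ggml_tensor * r = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
    struct lm_ggml_tensor * w = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
    struct lm_ggml_tensor * k = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
    struct lm_ggml_tensor * v = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
    struct lm_ggml_tensor * a = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);
    struct lm_ggml_tensor * b = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, S, H, n_tokens);

    // one S x S recurrent state per head, per sequence
    struct lm_ggml_tensor * s = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, S*S*H, n_seqs);

    // result rows 0..n_tokens-1 are the wkv output; the remaining S*n_seqs rows are the new state
    struct lm_ggml_tensor * out = lm_ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s);
    // out->ne == { S*H, n_tokens + S*n_seqs, 1, 1 }
    return out;
}
```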
package/cpp/gguf.cpp
CHANGED
@@ -932,6 +932,7 @@ static void lm_gguf_check_reserved_keys(const std::string & key, const T val) {
         if constexpr (std::is_same<T, uint32_t>::value) {
             LM_GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && LM_GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
         } else {
+            LM_GGML_UNUSED(val);
             LM_GGML_ABORT(LM_GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
         }
     }
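Note: the only change here adds LM_GGML_UNUSED(val) so the else branch of the if constexpr does not trigger an unused-argument warning for instantiations where T is not uint32_t. The pattern is a plain (void) cast; the standalone illustration below uses a stand-in macro and is not the package's own definition.

```cpp
#include <cstdint>
#include <cstdio>
#include <type_traits>

// Stand-in for LM_GGML_UNUSED; ggml defines it along these lines.
#define MY_UNUSED(x) (void)(x)

template <typename T>
void check_alignment(T val) {
    if constexpr (std::is_same<T, uint32_t>::value) {
        std::printf("alignment = %u\n", val);
    } else {
        MY_UNUSED(val); // otherwise 'val' is unused in this instantiation and some builds warn
        std::printf("alignment key must be u32\n");
    }
}

int main() {
    check_alignment<uint32_t>(32);
    check_alignment<float>(1.5f);
}
```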
package/cpp/llama-adapter.cpp
CHANGED
@@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"

-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>

 // vec

-
+lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -19,7 +18,7 @@ struct lm_ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     return tensors[il];
 }

-
+lm_ggml_tensor * llama_adapter_cvec::apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const {
     lm_ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = lm_ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-
+            lm_ggml_init_params params = {
                 /*.mem_size   =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }

-
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end   = -1;
-        return
+        return true;
     }

     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return
+        return false;
     }

     if (tensors.empty()) {
         if (!init(model)) {
-            return
+            return false;
         }
     }

@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }

-    return
+    return true;
 }

 // lora

-llama_adapter_lora_weight * llama_adapter_lora::get_weight(
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(lm_ggml_tensor * w) {
     const std::string name(w->name);

     const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct lm_ggml_tensor
     return nullptr;
 }

-static void llama_adapter_lora_init_impl(
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

     lm_ggml_context * ctx_init;
-
+    lm_gguf_init_params meta_lm_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx      = */ &ctx_init,
     };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-
+            lm_ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*lm_ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
     }

+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<lm_ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
+            lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
+
+        if (lm_ggml_backend_dev_get_extra_bufts_fn) {
+            lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }

-
+        auto * buft = lm_ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, lm_ggml_backend_buft_name(buft));
+
+        lm_ggml_context * dev_ctx = ctx_for_buft(buft);
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }

         // save tensor to adapter
-
-
+        lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+        lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
         lm_ggml_set_name(tensor_a, w.a->name);
         lm_ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     {
         llama_file lm_gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](
+        auto set_tensor = [&](lm_ggml_tensor * orig, lm_ggml_tensor * dev) {
             size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = lm_ggml_nbytes(orig);
             read_buf.resize(size);
@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-
-
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();

     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
         return nullptr;
     }

-void llama_adapter_lora_free(
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
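Note: besides the signature cleanups, this file now returns bool from llama_adapter_cvec::apply and falls back to a plain CPU buffer for LoRA tensors whose base-model counterpart lives in an "extra" (repacked) buffer type. The public entry points keep the shapes shown above; the usage sketch below is an assumption about typical caller code (the model/context setup and llama_set_adapter_lora usage are not taken from this package).

```cpp
#include "llama.h"

// Sketch: load a LoRA adapter and attach it to a context with a 0.75 scale.
// Assumes the upstream llama.cpp adapter API (llama_adapter_lora_init,
// llama_set_adapter_lora, llama_adapter_lora_free) is exposed unchanged.
static bool attach_lora(llama_model * model, llama_context * lctx, const char * path) {
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
    if (adapter == nullptr) {
        return false; // loading failed (bad file, wrong base model, ...)
    }

    if (llama_set_adapter_lora(lctx, adapter, 0.75f) != 0) {
        llama_adapter_lora_free(adapter);
        return false;
    }
    return true;
}
```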
package/cpp/llama-adapter.h
CHANGED
@@ -15,11 +15,11 @@
 //

 struct llama_adapter_cvec {
-
+    lm_ggml_tensor * tensor_for(int il) const;

-
+    lm_ggml_tensor * apply_to(lm_ggml_context * ctx, lm_ggml_tensor * cur, int il) const;

-
+    bool apply(
             const llama_model & model,
             const float * data,
             size_t len,
@@ -36,7 +36,7 @@ private:
     std::vector<lm_ggml_context_ptr> ctxs;
     std::vector<lm_ggml_backend_buffer_ptr> bufs;

-    std::vector<
+    std::vector<lm_ggml_tensor *> tensors; // per layer
 };

 //
@@ -44,8 +44,8 @@ private:
 //

 struct llama_adapter_lora_weight {
-
-
+    lm_ggml_tensor * a = nullptr;
+    lm_ggml_tensor * b = nullptr;

     // get actual scale based on rank and alpha
     float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
     }

     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(
+    llama_adapter_lora_weight(lm_ggml_tensor * a, lm_ggml_tensor * b) : a(a), b(b) {}
 };

 struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string,
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

     std::vector<lm_ggml_context_ptr> ctxs;
     std::vector<lm_ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;

-    llama_adapter_lora_weight * get_weight(
+    llama_adapter_lora_weight * get_weight(lm_ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;