cui-llama.rn 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/android/src/main/CMakeLists.txt +22 -19
- package/android/src/main/java/com/rnllama/LlamaContext.java +62 -20
- package/cpp/common.cpp +4 -11
- package/cpp/common.h +1 -1
- package/cpp/ggml-aarch64.c +2193 -2193
- package/cpp/ggml-aarch64.h +39 -39
- package/cpp/ggml-alloc.c +1042 -1041
- package/cpp/ggml-backend-impl.h +153 -153
- package/cpp/ggml-backend.c +2234 -2225
- package/cpp/ggml-backend.h +238 -236
- package/cpp/ggml-common.h +1829 -1829
- package/cpp/ggml-impl.h +655 -655
- package/cpp/ggml-metal.h +65 -65
- package/cpp/ggml-metal.m +3269 -3273
- package/cpp/ggml-quants.c +14860 -15022
- package/cpp/ggml-quants.h +132 -132
- package/cpp/ggml.c +16 -6
- package/cpp/ggml.h +2447 -2444
- package/cpp/llama.cpp +634 -531
- package/cpp/llama.h +30 -14
- package/cpp/log.h +737 -737
- package/cpp/rn-llama.hpp +9 -1
- package/cpp/sampling.cpp +460 -460
- package/cpp/sgemm.cpp +1027 -1027
- package/cpp/sgemm.h +14 -14
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -19,6 +19,8 @@
 # include "ggml-sycl.h"
 #elif defined(LM_GGML_USE_KOMPUTE)
 # include "ggml-kompute.h"
+#elif defined(LM_GGML_USE_CANN)
+# include "ggml-cann.h"
 #endif

 #ifdef LM_GGML_USE_BLAS
@@ -112,7 +114,7 @@

 // bump if necessary
 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_LAYERS
+#define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

 //
@@ -298,6 +300,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 };

 enum llm_kv {
+    LLM_KV_GENERAL_TYPE,
     LLM_KV_GENERAL_ARCHITECTURE,
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
@@ -388,9 +391,13 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+
+    LLM_KV_ADAPTER_TYPE,
+    LLM_KV_ADAPTER_LORA_ALPHA,
 };

 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+    { LLM_KV_GENERAL_TYPE, "general.type" },
     { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -481,6 +488,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+
+    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
 };

 struct LLM_KV {
@@ -2082,6 +2092,8 @@ struct llama_state {
         lm_ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
 #elif defined(LM_GGML_USE_CUDA)
         lm_ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(LM_GGML_USE_CANN)
+        lm_ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }

@@ -2714,6 +2726,9 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;

+    // keep track of loaded lora adapters
+    std::set<struct llama_lora_adapter *> lora_adapters;
+
     ~llama_model() {
         for (struct lm_ggml_context * ctx : ctxs) {
             lm_ggml_free(ctx);
@@ -2726,6 +2741,9 @@ struct llama_model {
 #endif
             lm_ggml_backend_buffer_free(buf);
         }
+        while (!lora_adapters.empty()) {
+            llama_lora_adapter_free(*lora_adapters.begin());
+        }
     }
 };
@@ -2830,6 +2848,52 @@ struct llama_context {

     // control vectors
     struct llama_control_vector cvec;
+
+    // lora adapters and scales
+    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+};
+
+struct llama_lora_weight {
+    struct lm_ggml_tensor * a = nullptr;
+    struct lm_ggml_tensor * b = nullptr;
+    llama_lora_weight() = default;
+    llama_lora_weight(struct lm_ggml_tensor * a, struct lm_ggml_tensor * b): a(a), b(b) {}
+};
+
+struct llama_lora_adapter {
+    struct llama_model * base_model;
+    // map tensor name to lora_a_b
+    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+    std::vector<struct lm_ggml_context *> ctxs;
+    std::vector<lm_ggml_backend_buffer_t> bufs;
+
+    float alpha;
+
+    llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
+        base_model->lora_adapters.insert(this);
+    }
+
+    llama_lora_weight * get_weight(struct lm_ggml_tensor * w) {
+        std::string name(w->name);
+        auto pos = ab_map.find(name);
+        if (ab_map.find(name) != ab_map.end()) {
+            return &pos->second;
+        }
+        return nullptr;
+    }
+
+    ~llama_lora_adapter() {
+        for (struct lm_ggml_context * ctx : ctxs) {
+            lm_ggml_free(ctx);
+        }
+        for (lm_ggml_backend_buffer_t buf : bufs) {
+            lm_ggml_backend_buffer_free(buf);
+        }
+        auto pos = base_model->lora_adapters.find(this);
+        if (pos != base_model->lora_adapters.end()) {
+            base_model->lora_adapters.erase(pos);
+        }
+    }
 };

 static size_t llama_get_device_count(const llama_model & model) {
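Read together with the llm_build_lora_mm helper added later in this diff, the adapter state above (a per-tensor a/b pair in ab_map, a stored alpha, and a per-adapter scale kept in llama_context::lora_adapters) implies the usual LoRA forward rule. As a sketch of the math only (s is the effective scale, r the rank taken from b):

    y = W x + s \cdot B (A x), \qquad s = \begin{cases} \text{scale} \cdot \alpha / r, & \alpha \neq 0 \\ \text{scale}, & \alpha = 0 \end{cases}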
@@ -2840,6 +2904,8 @@ static size_t llama_get_device_count(const llama_model & model) {
     count = lm_ggml_backend_sycl_get_device_count();
 #elif defined(LM_GGML_USE_VULKAN)
     count = lm_ggml_backend_vk_get_device_count();
+#elif defined(LM_GGML_USE_CANN)
+    return lm_ggml_backend_cann_get_device_count();
 #endif
 #if defined(LM_GGML_USE_RPC)
     count += model.rpc_servers.size();
@@ -2872,6 +2938,8 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
     if (buft == nullptr) {
         LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
     }
+#elif defined(LM_GGML_USE_CANN)
+    buft = lm_ggml_backend_cann_buffer_type(gpu);
 #endif

     if (buft == nullptr) {
@@ -2932,6 +3000,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
     size_t free;
     lm_ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
+#elif defined(LM_GGML_USE_CANN)
+    size_t total;
+    size_t free;
+    lm_ggml_backend_cann_get_device_memory(device, &total, &free);
+    return free;
 #else
     return 1;
 #endif
@@ -3645,7 +3718,7 @@ struct llama_model_loader {
     }

     if (param_overrides_p != nullptr) {
-        for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+        for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
             kv_overrides.insert({std::string(p->key), *p});
         }
     }
@@ -3813,7 +3886,7 @@
     ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

     {
-        const int kid = lm_gguf_find_key(meta, "general.file_type");
+        const int kid = lm_gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
         if (kid >= 0) {
             ftype = (llama_ftype) lm_gguf_get_val_u32(meta, kid);
         }
@@ -3945,7 +4018,9 @@
             throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
         }

-
+        if (arr_info.length > N_MAX) {
+            throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
+        }

         std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());

@@ -3981,8 +4056,6 @@
     // get array of n <= N_MAX elements, or a single element repeated n times
     template<typename T, size_t N_MAX>
     bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
-        LM_GGML_ASSERT(n <= N_MAX);
-
         const int kid = lm_gguf_find_key(meta, key.c_str());

         if (kid < 0) {
@@ -3992,6 +4065,10 @@
             return false;
         }

+        if (n > N_MAX) {
+            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+        }
+
         if (lm_gguf_get_kv_type(meta, kid) == LM_GGUF_TYPE_ARRAY) {
             struct GGUFMeta::ArrayInfo arr_info =
                 GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
@@ -4461,40 +4538,36 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }

     switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32:
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_MOSTLY_BF16:
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q4_1:
-        case
-
-        case
-        case
-        case
-
-
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -4945,7 +5018,7 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
-                    case 42: model.type = e_model::
+                    case 42: model.type = e_model::MODEL_7B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -5307,6 +5380,7 @@ static void llm_load_vocab(
         if (merges_keyidx == -1) {
             throw std::runtime_error("cannot find tokenizer merges in model file\n");
         }
+
         const int n_merges = lm_gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = lm_gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5345,16 +5419,6 @@ static void llm_load_vocab(
         vocab.special_cls_id = -1;
         vocab.special_mask_id = -1;

-        const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = lm_gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.
-
         const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = lm_gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5462,6 +5526,19 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5485,10 +5562,8 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

-
-
-            vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }

     const int token_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6069,10 +6144,10 @@ static bool llm_load_tensors(

                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd,
-                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd,
-                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd,
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                     // optional bias tensors
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -7820,6 +7895,58 @@ static void llm_build_kv_store(
     lm_ggml_build_forward_expand(graph, lm_ggml_cpy(ctx, v_cur, v_cache_view));
 }

+// do mat_mul, while optionally apply lora
+static struct lm_ggml_tensor * llm_build_lora_mm(
+        struct llama_context & lctx,
+        struct lm_ggml_context * ctx0,
+        struct lm_ggml_tensor * w,
+        struct lm_ggml_tensor * cur) {
+    struct lm_ggml_tensor * res = lm_ggml_mul_mat(ctx0, w, cur);
+    for (auto & it : lctx.lora_adapters) {
+        struct llama_lora_weight * lora = it.first->get_weight(w);
+        if (lora == nullptr) {
+            continue;
+        }
+        const float alpha = it.first->alpha;
+        const float rank = (float) lora->b->ne[0];
+        const float scale = alpha ? it.second * alpha / rank : it.second;
+        struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat(
+            ctx0, lora->b,
+            lm_ggml_mul_mat(ctx0, lora->a, cur)
+        );
+        ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
+        res = lm_ggml_add(ctx0, res, ab_cur);
+    }
+    return res;
+}
+
+// do mat_mul_id, while optionally apply lora
+static struct lm_ggml_tensor * llm_build_lora_mm_id(
+        struct llama_context & lctx,
+        struct lm_ggml_context * ctx0,
+        struct lm_ggml_tensor * w,   // struct lm_ggml_tensor * as
+        struct lm_ggml_tensor * cur, // struct lm_ggml_tensor * b
+        struct lm_ggml_tensor * ids) {
+    struct lm_ggml_tensor * res = lm_ggml_mul_mat_id(ctx0, w, cur, ids);
+    for (auto & it : lctx.lora_adapters) {
+        struct llama_lora_weight * lora = it.first->get_weight(w);
+        if (lora == nullptr) {
+            continue;
+        }
+        const float alpha = it.first->alpha;
+        const float rank = (float) lora->b->ne[0];
+        const float scale = alpha ? it.second * alpha / rank : it.second;
+        struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat_id(
+            ctx0, lora->b,
+            lm_ggml_mul_mat_id(ctx0, lora->a, cur, ids),
+            ids
+        );
+        ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
+        res = lm_ggml_add(ctx0, res, ab_cur);
+    }
+    return res;
+}
+
 static struct lm_ggml_tensor * llm_build_norm(
         struct lm_ggml_context * ctx,
         struct lm_ggml_tensor * cur,
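The per-adapter scale in llm_build_lora_mm and llm_build_lora_mm_id above follows the common LoRA convention of alpha/rank, falling back to the raw user scale when the adapter stores alpha as 0. A minimal standalone sketch of that computation (the names lora_scale, adapter_scale, alpha and rank are illustrative only, not part of the library API):

#include <cstdio>

// Sketch: derivation of the per-adapter scale used above.
// adapter_scale corresponds to it.second, alpha to it.first->alpha,
// and rank to lora->b->ne[0]; all names here are hypothetical.
static float lora_scale(float adapter_scale, float alpha, float rank) {
    // alpha == 0 means no alpha was stored: use the user scale directly
    return alpha != 0.0f ? adapter_scale * alpha / rank : adapter_scale;
}

int main() {
    std::printf("%f\n", lora_scale(1.0f, 16.0f, 8.0f)); // prints 2.000000
    return 0;
}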
@@ -7854,6 +7981,7 @@ static struct lm_ggml_tensor * llm_build_norm(

 static struct lm_ggml_tensor * llm_build_ffn(
         struct lm_ggml_context * ctx,
+        struct llama_context & lctx,
         struct lm_ggml_tensor * cur,
         struct lm_ggml_tensor * up,
         struct lm_ggml_tensor * up_b,
@@ -7869,7 +7997,7 @@ static struct lm_ggml_tensor * llm_build_ffn(
         llm_ffn_gate_type type_gate,
         const llm_build_cb & cb,
         int il) {
-    struct lm_ggml_tensor * tmp = up ?
+    struct lm_ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
     cb(tmp, "ffn_up", il);

     if (up_b) {
@@ -7886,12 +8014,12 @@ static struct lm_ggml_tensor * llm_build_ffn(
     switch (type_gate) {
         case LLM_FFN_SEQ:
             {
-                cur =
+                cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
                 cb(cur, "ffn_gate", il);
             } break;
         case LLM_FFN_PAR:
             {
-                cur =
+                cur = llm_build_lora_mm(lctx, ctx, gate, cur);
                 cb(cur, "ffn_gate", il);
             } break;
     }
@@ -7959,7 +8087,7 @@ static struct lm_ggml_tensor * llm_build_ffn(
     }

     if (down) {
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx, down, cur);
     }

     if (down_b) {
@@ -7980,6 +8108,7 @@ static struct lm_ggml_tensor * llm_build_ffn(

 static struct lm_ggml_tensor * llm_build_moe_ffn(
         struct lm_ggml_context * ctx,
+        struct llama_context & lctx,
         struct lm_ggml_tensor * cur,
         struct lm_ggml_tensor * gate_inp,
         struct lm_ggml_tensor * up_exps,
@@ -7996,7 +8125,7 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
     int64_t n_embd = cur->ne[0];
     int64_t n_tokens = cur->ne[1];

-    lm_ggml_tensor * logits =
+    lm_ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);

     lm_ggml_tensor * probs = lm_ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
@@ -8028,10 +8157,10 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
     }

     cur = lm_ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
-    lm_ggml_tensor * up =
+    lm_ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    lm_ggml_tensor * gate =
+    lm_ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(gate, "ffn_moe_gate", il);

     switch (type_op) {
@@ -8052,7 +8181,7 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
     lm_ggml_tensor * par = lm_ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
     cb(par, "ffn_moe_gate_par", il);

-    lm_ggml_tensor * experts =
+    lm_ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     experts = lm_ggml_mul(ctx, experts, weights);
@@ -8080,9 +8209,7 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(

 static struct lm_ggml_tensor * llm_build_kqv(
         struct lm_ggml_context * ctx,
-
-        const llama_hparams & hparams,
-        const llama_cparams & cparams,
+        struct llama_context & lctx,
         const llama_kv_cache & kv,
         struct lm_ggml_cgraph * graph,
         struct lm_ggml_tensor * wo,
@@ -8094,6 +8221,10 @@ static struct lm_ggml_tensor * llm_build_kqv(
         float kq_scale,
         const llm_build_cb & cb,
         int il) {
+    const llama_model & model = lctx.model;
+    const llama_hparams & hparams = lctx.model.hparams;
+    const llama_cparams & cparams = lctx.cparams;
+
     const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head = hparams.n_head(il);
     const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -8192,7 +8323,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
     lm_ggml_build_forward_expand(graph, cur);

     if (wo) {
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx, wo, cur);
     }

     if (wo_b) {
@@ -8208,9 +8339,7 @@ static struct lm_ggml_tensor * llm_build_kqv(

 static struct lm_ggml_tensor * llm_build_kv(
         struct lm_ggml_context * ctx,
-
-        const llama_hparams & hparams,
-        const llama_cparams & cparams,
+        struct llama_context & lctx,
         const llama_kv_cache & kv,
         struct lm_ggml_cgraph * graph,
         struct lm_ggml_tensor * wo,
@@ -8225,6 +8354,8 @@ static struct lm_ggml_tensor * llm_build_kv(
         float kq_scale,
         const llm_build_cb & cb,
         int il) {
+    const llama_hparams & hparams = lctx.model.hparams;
+    const llama_cparams & cparams = lctx.cparams;

     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
@@ -8236,7 +8367,7 @@ static struct lm_ggml_tensor * llm_build_kv(

     struct lm_ggml_tensor * cur;

-    cur = llm_build_kqv(ctx,
+    cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
             q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);

@@ -8698,21 +8829,21 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -8733,7 +8864,7 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -8756,7 +8887,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -8770,7 +8901,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_moe_ffn(ctx0, cur,
+                cur = llm_build_moe_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_gate_inp,
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
@@ -8800,7 +8931,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -8836,13 +8967,13 @@ struct llm_build_context {

             // self-attention
             {
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

                 switch (model.type) {
@@ -8868,7 +8999,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -8890,7 +9021,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
@@ -8915,7 +9046,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -8951,13 +9082,13 @@ struct llm_build_context {

             // self-attention
             {
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

                 Qcur = lm_ggml_rope_ext(
@@ -8973,7 +9104,7 @@ struct llm_build_context {
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -8995,7 +9126,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
@@ -9018,7 +9149,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -9067,7 +9198,7 @@ struct llm_build_context {
                 cur = attn_norm;
             }

-            cur =
+            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

             struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
@@ -9094,7 +9225,7 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            cur = llm_build_kv(ctx0,
+            cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                     model.layers[il].wo, NULL,
                     Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
         }
@@ -9111,7 +9242,7 @@ struct llm_build_context {

             // feed forward
             {
-                cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
+                cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result
                         model.layers[il].ffn_up, NULL, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
@@ -9138,7 +9269,7 @@ struct llm_build_context {
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);

-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -9183,21 +9314,21 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -9218,7 +9349,7 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
@@ -9250,7 +9381,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_moe_ffn(ctx0, cur,
+                cur = llm_build_moe_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_gate_inp,
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
@@ -9289,7 +9420,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);

         // Grok
         // multiply logits by output_multiplier_scale of 0.5773502691896257
@@ -9340,7 +9471,7 @@ struct llm_build_context {
                 struct lm_ggml_tensor * Kcur = nullptr;
                 struct lm_ggml_tensor * Vcur = nullptr;

-                cur =
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

                 cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -9368,7 +9499,7 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -9391,7 +9522,7 @@ struct llm_build_context {
                     LLM_NORM, cb, il);
             cb(cur, "attn_out_norm", il);

-            cur = llm_build_moe_ffn(ctx0, cur,
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
                     model.layers[il].ffn_gate_inp,
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
@@ -9420,7 +9551,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);

         cb(cur, "result_output", -1);

@@ -9462,7 +9593,7 @@ struct llm_build_context {

             // self-attention
             {
-                cur =
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

                 cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -9478,7 +9609,7 @@ struct llm_build_context {

                 Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -9502,7 +9633,7 @@ struct llm_build_context {
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -9525,7 +9656,7 @@ struct llm_build_context {
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);

-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -9557,13 +9688,13 @@ struct llm_build_context {

             // self-attention
             {
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

                 Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@@ -9572,7 +9703,7 @@ struct llm_build_context {
                 Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 cb(Qcur, "Qcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -9594,7 +9725,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
@@ -9619,7 +9750,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -9671,7 +9802,7 @@ struct llm_build_context {

             // self-attention
             if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-                Qcur = lm_ggml_add(ctx0,
+                Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);

                 if (model.layers[il].attn_q_norm) {
@@ -9681,7 +9812,7 @@ struct llm_build_context {
                             LLM_NORM, cb, il);
                 }

-                Kcur = lm_ggml_add(ctx0,
+                Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                 cb(Kcur, "Kcur", il);

                 if (model.layers[il].attn_k_norm) {
@@ -9690,14 +9821,14 @@ struct llm_build_context {
                             model.layers[il].attn_k_norm_b,
                             LLM_NORM, cb, il);
                 }
-                Vcur = lm_ggml_add(ctx0,
+                Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                 cb(Vcur, "Vcur", il);

                 Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 // compute Q and K and RoPE them
-                cur =
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

                 Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
@@ -9746,7 +9877,7 @@ struct llm_build_context {

             lm_ggml_build_forward_expand(gf, cur);

-            cur =
+            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
             if (model.layers[il].bo) {
                 cb(cur, "kqv_wo", il);
             }
@@ -9779,21 +9910,21 @@ struct llm_build_context {

             // feed-forward network
             if (model.arch == LLM_ARCH_BERT) {
-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
             } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
             } else {
-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
@@ -9851,7 +9982,7 @@ struct llm_build_context {

             // self-attention
             {
-                cur =
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

                 cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -9867,7 +9998,7 @@ struct llm_build_context {

                 Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -9891,7 +10022,7 @@ struct llm_build_context {
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -9914,7 +10045,7 @@ struct llm_build_context {
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);

-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -9961,7 +10092,7 @@ struct llm_build_context {
             {
                 cur = attn_norm;

-                cur =
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

                 if (model.layers[il].bqkv){
@@ -9999,13 +10130,13 @@ struct llm_build_context {
                     Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                     Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

-                    cur = llm_build_kv(ctx0,
+                    cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                             model.layers[il].wo, model.layers[il].bo,
                             Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 } else {
                     Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-                    cur = llm_build_kv(ctx0,
+                    cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                             model.layers[il].wo, model.layers[il].bo,
                             Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 }
@@ -10029,7 +10160,7 @@ struct llm_build_context {
                         model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -10054,7 +10185,7 @@ struct llm_build_context {
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);

-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -10094,21 +10225,21 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -10150,7 +10281,7 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -10178,7 +10309,7 @@ struct llm_build_context {
                 // parallel residual
                 cur = inpSA;
             }
-            cur = llm_build_ffn(ctx0, cur,
+            cur = llm_build_ffn(ctx0, lctx, cur,
                     model.layers[il].ffn_up, NULL, NULL,
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
@@ -10204,7 +10335,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -10239,7 +10370,7 @@ struct llm_build_context {

             // self-attention
             {
-                cur =
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

                 cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10269,7 +10400,7 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0,
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
@@ -10291,7 +10422,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

-                cur = llm_build_ffn(ctx0, cur,
+                cur = llm_build_ffn(ctx0, lctx, cur,
                         model.layers[il].ffn_up, NULL, NULL,
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, NULL, NULL,
@@ -10316,7 +10447,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur =
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         lm_ggml_build_forward_expand(gf, cur);
@@ -10354,17 +10485,17 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct lm_ggml_tensor * Qcur =
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
                 cb(Qcur, "Qcur", il);

-                struct lm_ggml_tensor * Kcur =
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
                 cb(Kcur, "Kcur", il);

-                struct lm_ggml_tensor * Vcur =
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
@@ -10383,7 +10514,7 @@ struct llm_build_context {
|
|
10383
10514
|
);
|
10384
10515
|
cb(Kcur, "Kcur", il);
|
10385
10516
|
|
10386
|
-
cur = llm_build_kv(ctx0,
|
10517
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
10387
10518
|
model.layers[il].wo, model.layers[il].bo,
|
10388
10519
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10389
10520
|
}
|
@@ -10404,7 +10535,7 @@ struct llm_build_context {
|
|
10404
10535
|
LLM_NORM_RMS, cb, il);
|
10405
10536
|
cb(cur, "ffn_norm", il);
|
10406
10537
|
|
10407
|
-
cur = llm_build_ffn(ctx0, cur,
|
10538
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
10408
10539
|
model.layers[il].ffn_up, NULL, NULL,
|
10409
10540
|
model.layers[il].ffn_gate, NULL, NULL,
|
10410
10541
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -10428,7 +10559,7 @@ struct llm_build_context {
|
|
10428
10559
|
cb(cur, "result_norm", -1);
|
10429
10560
|
|
10430
10561
|
// lm_head
|
10431
|
-
cur =
|
10562
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
10432
10563
|
cb(cur, "result_output", -1);
|
10433
10564
|
|
10434
10565
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -10469,17 +10600,17 @@ struct llm_build_context {
|
|
10469
10600
|
// self_attention
|
10470
10601
|
{
|
10471
10602
|
// compute Q and K and RoPE them
|
10472
|
-
struct lm_ggml_tensor * Qcur =
|
10603
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
10473
10604
|
cb(Qcur, "Qcur", il);
|
10474
10605
|
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
10475
10606
|
cb(Qcur, "Qcur", il);
|
10476
10607
|
|
10477
|
-
struct lm_ggml_tensor * Kcur =
|
10608
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
10478
10609
|
cb(Kcur, "Kcur", il);
|
10479
10610
|
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
10480
10611
|
cb(Kcur, "Kcur", il);
|
10481
10612
|
|
10482
|
-
struct lm_ggml_tensor * Vcur =
|
10613
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
10483
10614
|
cb(Vcur, "Vcur", il);
|
10484
10615
|
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
10485
10616
|
cb(Vcur, "Vcur", il);
|
@@ -10498,7 +10629,7 @@ struct llm_build_context {
|
|
10498
10629
|
);
|
10499
10630
|
cb(Kcur, "Kcur", il);
|
10500
10631
|
|
10501
|
-
cur = llm_build_kv(ctx0,
|
10632
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
10502
10633
|
model.layers[il].wo, model.layers[il].bo,
|
10503
10634
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10504
10635
|
}
|
@@ -10521,7 +10652,7 @@ struct llm_build_context {
|
|
10521
10652
|
cb(cur, "ffn_norm", il);
|
10522
10653
|
|
10523
10654
|
lm_ggml_tensor * moe_out =
|
10524
|
-
llm_build_moe_ffn(ctx0, cur,
|
10655
|
+
llm_build_moe_ffn(ctx0, lctx, cur,
|
10525
10656
|
model.layers[il].ffn_gate_inp,
|
10526
10657
|
model.layers[il].ffn_up_exps,
|
10527
10658
|
model.layers[il].ffn_gate_exps,
|
@@ -10534,14 +10665,14 @@ struct llm_build_context {
|
|
10534
10665
|
|
10535
10666
|
// FFN shared expert
|
10536
10667
|
{
|
10537
|
-
lm_ggml_tensor * cur_gate_inp =
|
10668
|
+
lm_ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
|
10538
10669
|
cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
|
10539
10670
|
|
10540
10671
|
// sigmoid
|
10541
10672
|
lm_ggml_tensor * cur_gate = lm_ggml_div(ctx0, lm_ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
|
10542
10673
|
cb(cur_gate, "ffn_shexp_gate", il);
|
10543
10674
|
|
10544
|
-
lm_ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
|
10675
|
+
lm_ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur,
|
10545
10676
|
model.layers[il].ffn_up_shexp, NULL, NULL,
|
10546
10677
|
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
10547
10678
|
model.layers[il].ffn_down_shexp, NULL, NULL,
|
@@ -10574,7 +10705,7 @@ struct llm_build_context {
|
|
10574
10705
|
cb(cur, "result_norm", -1);
|
10575
10706
|
|
10576
10707
|
// lm_head
|
10577
|
-
cur = lm_ggml_mul_mat(ctx0, model.output, cur);
|
10708
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
10578
10709
|
cb(cur, "result_output", -1);
|
10579
10710
|
|
10580
10711
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -10616,7 +10747,7 @@ struct llm_build_context {
|
|
10616
10747
|
struct lm_ggml_tensor * Vcur = nullptr;
|
10617
10748
|
|
10618
10749
|
if (model.layers[il].wqkv) {
|
10619
|
-
cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
10750
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
|
10620
10751
|
cb(cur, "wqkv", il);
|
10621
10752
|
|
10622
10753
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
|
@@ -10626,9 +10757,9 @@ struct llm_build_context {
|
|
10626
10757
|
Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
10627
10758
|
Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
10628
10759
|
} else {
|
10629
|
-
Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
10630
|
-
Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
10631
|
-
Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
10760
|
+
Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
10761
|
+
Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
10762
|
+
Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
10632
10763
|
}
|
10633
10764
|
|
10634
10765
|
cb(Qcur, "Qcur", il);
|
@@ -10655,7 +10786,7 @@ struct llm_build_context {
|
|
10655
10786
|
);
|
10656
10787
|
cb(Kcur, "Kcur", il);
|
10657
10788
|
|
10658
|
-
cur = llm_build_kv(ctx0,
|
10789
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
10659
10790
|
model.layers[il].wo, model.layers[il].bo,
|
10660
10791
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
10661
10792
|
}
|
@@ -10670,7 +10801,7 @@ struct llm_build_context {
|
|
10670
10801
|
|
10671
10802
|
// FF
|
10672
10803
|
{
|
10673
|
-
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
10804
|
+
ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output,
|
10674
10805
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
10675
10806
|
NULL, NULL, NULL,
|
10676
10807
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -10694,7 +10825,7 @@ struct llm_build_context {
|
|
10694
10825
|
LLM_NORM, cb, -1);
|
10695
10826
|
cb(cur, "result_norm", -1);
|
10696
10827
|
|
10697
|
-
cur = lm_ggml_mul_mat(ctx0, model.output, cur);
|
10828
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
10698
10829
|
cb(cur, "result_output_no_bias", -1);
|
10699
10830
|
|
10700
10831
|
cur = lm_ggml_add(ctx0, cur, model.output_b);
|
@@ -10740,7 +10871,7 @@ struct llm_build_context {
|
|
10740
10871
|
struct lm_ggml_tensor * Vcur = nullptr;
|
10741
10872
|
|
10742
10873
|
if (model.layers[il].wqkv) {
|
10743
|
-
cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
10874
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
|
10744
10875
|
cb(cur, "wqkv", il);
|
10745
10876
|
|
10746
10877
|
Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
|
@@ -10748,9 +10879,9 @@ struct llm_build_context {
|
|
10748
10879
|
Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
10749
10880
|
}
|
10750
10881
|
else {
|
10751
|
-
Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
10752
|
-
Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
10753
|
-
Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
10882
|
+
Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
10883
|
+
Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
10884
|
+
Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
10754
10885
|
}
|
10755
10886
|
|
10756
10887
|
cb(Qcur, "Qcur", il);
|
@@ -10775,7 +10906,7 @@ struct llm_build_context {
|
|
10775
10906
|
);
|
10776
10907
|
cb(Kcur, "Kcur", il);
|
10777
10908
|
|
10778
|
-
cur = llm_build_kv(ctx0,
|
10909
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
10779
10910
|
model.layers[il].wo, model.layers[il].bo,
|
10780
10911
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
10781
10912
|
}
|
@@ -10799,7 +10930,7 @@ struct llm_build_context {
|
|
10799
10930
|
// special-case: the up and gate tensors are merged into a single tensor
|
10800
10931
|
// TOOD: support into llm_build_ffn
|
10801
10932
|
{
|
10802
|
-
cur = llm_build_ffn(ctx0, cur,
|
10933
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
10803
10934
|
model.layers[il].ffn_up, NULL, NULL,
|
10804
10935
|
NULL, NULL, NULL,
|
10805
10936
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -10822,7 +10953,7 @@ struct llm_build_context {
|
|
10822
10953
|
LLM_NORM_RMS, cb, -1);
|
10823
10954
|
cb(cur, "result_norm", -1);
|
10824
10955
|
|
10825
|
-
cur = lm_ggml_mul_mat(ctx0, model.output, cur);
|
10956
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
10826
10957
|
cb(cur, "result_output", -1);
|
10827
10958
|
|
10828
10959
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -10862,13 +10993,13 @@ struct llm_build_context {
|
|
10862
10993
|
// self-attention
|
10863
10994
|
{
|
10864
10995
|
// compute Q and K and RoPE them
|
10865
|
-
struct lm_ggml_tensor * Qcur =
|
10996
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
10866
10997
|
cb(Qcur, "Qcur", il);
|
10867
10998
|
|
10868
|
-
struct lm_ggml_tensor * Kcur =
|
10999
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
10869
11000
|
cb(Kcur, "Kcur", il);
|
10870
11001
|
|
10871
|
-
struct lm_ggml_tensor * Vcur =
|
11002
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
10872
11003
|
cb(Vcur, "Vcur", il);
|
10873
11004
|
|
10874
11005
|
Qcur = lm_ggml_rope_ext(
|
@@ -10883,7 +11014,7 @@ struct llm_build_context {
|
|
10883
11014
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
10884
11015
|
cb(Kcur, "Kcur", il);
|
10885
11016
|
|
10886
|
-
cur = llm_build_kv(ctx0,
|
11017
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
10887
11018
|
model.layers[il].wo, NULL,
|
10888
11019
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10889
11020
|
}
|
@@ -10901,7 +11032,7 @@ struct llm_build_context {
|
|
10901
11032
|
|
10902
11033
|
// feed-forward network
|
10903
11034
|
{
|
10904
|
-
cur = llm_build_ffn(ctx0, cur,
|
11035
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
10905
11036
|
model.layers[il].ffn_up, NULL, NULL,
|
10906
11037
|
model.layers[il].ffn_gate, NULL, NULL,
|
10907
11038
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -10927,7 +11058,7 @@ struct llm_build_context {
|
|
10927
11058
|
cb(cur, "result_norm", -1);
|
10928
11059
|
|
10929
11060
|
// lm_head
|
10930
|
-
cur =
|
11061
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
10931
11062
|
cb(cur, "result_output", -1);
|
10932
11063
|
|
10933
11064
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -10969,7 +11100,7 @@ struct llm_build_context {
|
|
10969
11100
|
|
10970
11101
|
// self-attention
|
10971
11102
|
{
|
10972
|
-
cur =
|
11103
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
10973
11104
|
cb(cur, "wqkv", il);
|
10974
11105
|
|
10975
11106
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
|
@@ -10985,7 +11116,7 @@ struct llm_build_context {
|
|
10985
11116
|
|
10986
11117
|
Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
10987
11118
|
|
10988
|
-
cur = llm_build_kv(ctx0,
|
11119
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
10989
11120
|
model.layers[il].wo, model.layers[il].bo,
|
10990
11121
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10991
11122
|
}
|
@@ -11009,7 +11140,7 @@ struct llm_build_context {
|
|
11009
11140
|
LLM_NORM, cb, il);
|
11010
11141
|
cb(cur, "ffn_norm", il);
|
11011
11142
|
|
11012
|
-
cur = llm_build_ffn(ctx0, cur,
|
11143
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11013
11144
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
11014
11145
|
NULL, NULL, NULL,
|
11015
11146
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -11032,7 +11163,7 @@ struct llm_build_context {
|
|
11032
11163
|
LLM_NORM, cb, -1);
|
11033
11164
|
cb(cur, "result_norm", -1);
|
11034
11165
|
|
11035
|
-
cur =
|
11166
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11036
11167
|
cb(cur, "result_output", -1);
|
11037
11168
|
|
11038
11169
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11068,7 +11199,7 @@ struct llm_build_context {
|
|
11068
11199
|
|
11069
11200
|
// self-attention
|
11070
11201
|
{
|
11071
|
-
cur =
|
11202
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
11072
11203
|
cb(cur, "wqkv", il);
|
11073
11204
|
|
11074
11205
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
|
@@ -11096,7 +11227,7 @@ struct llm_build_context {
|
|
11096
11227
|
);
|
11097
11228
|
cb(Kcur, "Kcur", il);
|
11098
11229
|
|
11099
|
-
cur = llm_build_kv(ctx0,
|
11230
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11100
11231
|
model.layers[il].wo, model.layers[il].bo,
|
11101
11232
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
11102
11233
|
}
|
@@ -11120,7 +11251,7 @@ struct llm_build_context {
|
|
11120
11251
|
LLM_NORM, cb, il);
|
11121
11252
|
cb(cur, "ffn_norm", il);
|
11122
11253
|
|
11123
|
-
cur = llm_build_ffn(ctx0, cur,
|
11254
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11124
11255
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
11125
11256
|
NULL, NULL, NULL,
|
11126
11257
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -11143,7 +11274,7 @@ struct llm_build_context {
|
|
11143
11274
|
LLM_NORM, cb, -1);
|
11144
11275
|
cb(cur, "result_norm", -1);
|
11145
11276
|
|
11146
|
-
cur =
|
11277
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11147
11278
|
cb(cur, "result_output", -1);
|
11148
11279
|
|
11149
11280
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11181,21 +11312,21 @@ struct llm_build_context {
|
|
11181
11312
|
// self-attention
|
11182
11313
|
{
|
11183
11314
|
// compute Q and K and RoPE them
|
11184
|
-
struct lm_ggml_tensor * Qcur =
|
11315
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11185
11316
|
cb(Qcur, "Qcur", il);
|
11186
11317
|
// if (model.layers[il].bq) {
|
11187
11318
|
// Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
11188
11319
|
// cb(Qcur, "Qcur", il);
|
11189
11320
|
// }
|
11190
11321
|
|
11191
|
-
struct lm_ggml_tensor * Kcur =
|
11322
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11192
11323
|
cb(Kcur, "Kcur", il);
|
11193
11324
|
// if (model.layers[il].bk) {
|
11194
11325
|
// Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
11195
11326
|
// cb(Kcur, "Kcur", il);
|
11196
11327
|
// }
|
11197
11328
|
|
11198
|
-
struct lm_ggml_tensor * Vcur =
|
11329
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11199
11330
|
cb(Vcur, "Vcur", il);
|
11200
11331
|
// if (model.layers[il].bv) {
|
11201
11332
|
// Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
@@ -11216,7 +11347,7 @@ struct llm_build_context {
|
|
11216
11347
|
);
|
11217
11348
|
cb(Kcur, "Kcur", il);
|
11218
11349
|
|
11219
|
-
cur = llm_build_kv(ctx0,
|
11350
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11220
11351
|
model.layers[il].wo, NULL,
|
11221
11352
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
11222
11353
|
}
|
@@ -11237,7 +11368,7 @@ struct llm_build_context {
|
|
11237
11368
|
LLM_NORM, cb, il);
|
11238
11369
|
cb(cur, "ffn_norm", il);
|
11239
11370
|
|
11240
|
-
cur = llm_build_ffn(ctx0, cur,
|
11371
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11241
11372
|
model.layers[il].ffn_up, NULL, NULL,
|
11242
11373
|
model.layers[il].ffn_gate, NULL, NULL,
|
11243
11374
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -11261,7 +11392,7 @@ struct llm_build_context {
|
|
11261
11392
|
cb(cur, "result_norm", -1);
|
11262
11393
|
|
11263
11394
|
// lm_head
|
11264
|
-
cur =
|
11395
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11265
11396
|
cb(cur, "result_output", -1);
|
11266
11397
|
|
11267
11398
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11299,21 +11430,21 @@ struct llm_build_context {
|
|
11299
11430
|
// self-attention
|
11300
11431
|
{
|
11301
11432
|
// compute Q and K and RoPE them
|
11302
|
-
struct lm_ggml_tensor * Qcur =
|
11433
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11303
11434
|
cb(Qcur, "Qcur", il);
|
11304
11435
|
if (model.layers[il].bq) {
|
11305
11436
|
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
11306
11437
|
cb(Qcur, "Qcur", il);
|
11307
11438
|
}
|
11308
11439
|
|
11309
|
-
struct lm_ggml_tensor * Kcur =
|
11440
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11310
11441
|
cb(Kcur, "Kcur", il);
|
11311
11442
|
if (model.layers[il].bk) {
|
11312
11443
|
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
11313
11444
|
cb(Kcur, "Kcur", il);
|
11314
11445
|
}
|
11315
11446
|
|
11316
|
-
struct lm_ggml_tensor * Vcur =
|
11447
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11317
11448
|
cb(Vcur, "Vcur", il);
|
11318
11449
|
if (model.layers[il].bv) {
|
11319
11450
|
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
@@ -11334,7 +11465,7 @@ struct llm_build_context {
|
|
11334
11465
|
);
|
11335
11466
|
cb(Kcur, "Kcur", il);
|
11336
11467
|
|
11337
|
-
cur = llm_build_kv(ctx0,
|
11468
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11338
11469
|
model.layers[il].wo, model.layers[il].bo,
|
11339
11470
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
11340
11471
|
}
|
@@ -11355,7 +11486,7 @@ struct llm_build_context {
|
|
11355
11486
|
LLM_NORM_RMS, cb, il);
|
11356
11487
|
cb(cur, "ffn_norm", il);
|
11357
11488
|
|
11358
|
-
cur = llm_build_ffn(ctx0, cur,
|
11489
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11359
11490
|
model.layers[il].ffn_up, NULL, NULL,
|
11360
11491
|
model.layers[il].ffn_gate, NULL, NULL,
|
11361
11492
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -11379,7 +11510,7 @@ struct llm_build_context {
|
|
11379
11510
|
cb(cur, "result_norm", -1);
|
11380
11511
|
|
11381
11512
|
// lm_head
|
11382
|
-
cur =
|
11513
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11383
11514
|
cb(cur, "result_output", -1);
|
11384
11515
|
|
11385
11516
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11430,21 +11561,21 @@ struct llm_build_context {
|
|
11430
11561
|
// self-attention
|
11431
11562
|
{
|
11432
11563
|
// compute Q and K and RoPE them
|
11433
|
-
struct lm_ggml_tensor * Qcur =
|
11564
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11434
11565
|
cb(Qcur, "Qcur", il);
|
11435
11566
|
if (model.layers[il].bq) {
|
11436
11567
|
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
11437
11568
|
cb(Qcur, "Qcur", il);
|
11438
11569
|
}
|
11439
11570
|
|
11440
|
-
struct lm_ggml_tensor * Kcur =
|
11571
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11441
11572
|
cb(Kcur, "Kcur", il);
|
11442
11573
|
if (model.layers[il].bk) {
|
11443
11574
|
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
11444
11575
|
cb(Kcur, "Kcur", il);
|
11445
11576
|
}
|
11446
11577
|
|
11447
|
-
struct lm_ggml_tensor * Vcur =
|
11578
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11448
11579
|
cb(Vcur, "Vcur", il);
|
11449
11580
|
if (model.layers[il].bv) {
|
11450
11581
|
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
@@ -11465,7 +11596,7 @@ struct llm_build_context {
|
|
11465
11596
|
);
|
11466
11597
|
cb(Kcur, "Kcur", il);
|
11467
11598
|
|
11468
|
-
cur = llm_build_kv(ctx0,
|
11599
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11469
11600
|
model.layers[il].wo, model.layers[il].bo,
|
11470
11601
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
11471
11602
|
}
|
@@ -11492,7 +11623,7 @@ struct llm_build_context {
|
|
11492
11623
|
LLM_NORM_RMS, cb, il);
|
11493
11624
|
cb(cur, "ffn_norm", il);
|
11494
11625
|
|
11495
|
-
cur = llm_build_ffn(ctx0, cur,
|
11626
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11496
11627
|
model.layers[il].ffn_up, NULL, NULL,
|
11497
11628
|
model.layers[il].ffn_gate, NULL, NULL,
|
11498
11629
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -11526,7 +11657,7 @@ struct llm_build_context {
|
|
11526
11657
|
cb(cur, "lmhead_scaling", -1);
|
11527
11658
|
|
11528
11659
|
// lm_head
|
11529
|
-
cur =
|
11660
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11530
11661
|
cb(cur, "result_output", -1);
|
11531
11662
|
|
11532
11663
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11563,13 +11694,13 @@ struct llm_build_context {
|
|
11563
11694
|
// self-attention
|
11564
11695
|
{
|
11565
11696
|
// compute Q and K and RoPE them
|
11566
|
-
struct lm_ggml_tensor * Qcur =
|
11697
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11567
11698
|
cb(Qcur, "Qcur", il);
|
11568
11699
|
|
11569
|
-
struct lm_ggml_tensor * Kcur =
|
11700
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11570
11701
|
cb(Kcur, "Kcur", il);
|
11571
11702
|
|
11572
|
-
struct lm_ggml_tensor * Vcur =
|
11703
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11573
11704
|
cb(Vcur, "Vcur", il);
|
11574
11705
|
|
11575
11706
|
Qcur = lm_ggml_rope_ext(
|
@@ -11587,7 +11718,7 @@ struct llm_build_context {
|
|
11587
11718
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
11588
11719
|
cb(Kcur, "Kcur", il);
|
11589
11720
|
|
11590
|
-
cur = llm_build_kv(ctx0,
|
11721
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11591
11722
|
model.layers[il].wo, NULL,
|
11592
11723
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
11593
11724
|
}
|
@@ -11609,7 +11740,7 @@ struct llm_build_context {
|
|
11609
11740
|
|
11610
11741
|
// feed-forward network
|
11611
11742
|
{
|
11612
|
-
cur = llm_build_ffn(ctx0, cur,
|
11743
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11613
11744
|
model.layers[il].ffn_up, NULL, NULL,
|
11614
11745
|
model.layers[il].ffn_gate, NULL, NULL,
|
11615
11746
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -11634,7 +11765,7 @@ struct llm_build_context {
|
|
11634
11765
|
cb(cur, "result_norm", -1);
|
11635
11766
|
|
11636
11767
|
// lm_head
|
11637
|
-
cur =
|
11768
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11638
11769
|
cb(cur, "result_output", -1);
|
11639
11770
|
|
11640
11771
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11676,13 +11807,13 @@ struct llm_build_context {
|
|
11676
11807
|
// self-attention
|
11677
11808
|
{
|
11678
11809
|
// compute Q and K and RoPE them
|
11679
|
-
struct lm_ggml_tensor * Qcur =
|
11810
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11680
11811
|
cb(Qcur, "Qcur", il);
|
11681
11812
|
|
11682
|
-
struct lm_ggml_tensor * Kcur =
|
11813
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11683
11814
|
cb(Kcur, "Kcur", il);
|
11684
11815
|
|
11685
|
-
struct lm_ggml_tensor * Vcur =
|
11816
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11686
11817
|
cb(Vcur, "Vcur", il);
|
11687
11818
|
|
11688
11819
|
Qcur = lm_ggml_rope_ext(
|
@@ -11705,7 +11836,7 @@ struct llm_build_context {
|
|
11705
11836
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
11706
11837
|
cb(Kcur, "Kcur", il);
|
11707
11838
|
|
11708
|
-
cur = llm_build_kv(ctx0,
|
11839
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11709
11840
|
model.layers[il].wo, NULL,
|
11710
11841
|
Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
11711
11842
|
}
|
@@ -11732,7 +11863,7 @@ struct llm_build_context {
|
|
11732
11863
|
|
11733
11864
|
// feed-forward network
|
11734
11865
|
{
|
11735
|
-
cur = llm_build_ffn(ctx0, cur,
|
11866
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11736
11867
|
model.layers[il].ffn_up, NULL, NULL,
|
11737
11868
|
model.layers[il].ffn_gate, NULL, NULL,
|
11738
11869
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -11762,7 +11893,7 @@ struct llm_build_context {
|
|
11762
11893
|
cb(cur, "result_norm", -1);
|
11763
11894
|
|
11764
11895
|
// lm_head
|
11765
|
-
cur =
|
11896
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11766
11897
|
|
11767
11898
|
// final logit soft-capping
|
11768
11899
|
cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
@@ -11807,21 +11938,21 @@ struct llm_build_context {
|
|
11807
11938
|
// self-attention
|
11808
11939
|
{
|
11809
11940
|
// compute Q and K and RoPE them
|
11810
|
-
struct lm_ggml_tensor * Qcur =
|
11941
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
11811
11942
|
cb(Qcur, "Qcur", il);
|
11812
11943
|
if (model.layers[il].bq) {
|
11813
11944
|
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
11814
11945
|
cb(Qcur, "Qcur", il);
|
11815
11946
|
}
|
11816
11947
|
|
11817
|
-
struct lm_ggml_tensor * Kcur =
|
11948
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
11818
11949
|
cb(Kcur, "Kcur", il);
|
11819
11950
|
if (model.layers[il].bk) {
|
11820
11951
|
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
11821
11952
|
cb(Kcur, "Kcur", il);
|
11822
11953
|
}
|
11823
11954
|
|
11824
|
-
struct lm_ggml_tensor * Vcur =
|
11955
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
11825
11956
|
cb(Vcur, "Vcur", il);
|
11826
11957
|
if (model.layers[il].bv) {
|
11827
11958
|
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
@@ -11842,7 +11973,7 @@ struct llm_build_context {
|
|
11842
11973
|
);
|
11843
11974
|
cb(Kcur, "Kcur", il);
|
11844
11975
|
|
11845
|
-
cur = llm_build_kv(ctx0,
|
11976
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
11846
11977
|
model.layers[il].wo, model.layers[il].bo,
|
11847
11978
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
11848
11979
|
}
|
@@ -11864,7 +11995,7 @@ struct llm_build_context {
|
|
11864
11995
|
LLM_NORM, cb, il);
|
11865
11996
|
cb(cur, "ffn_norm", il);
|
11866
11997
|
|
11867
|
-
cur = llm_build_ffn(ctx0, cur,
|
11998
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
11868
11999
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
11869
12000
|
NULL, NULL, NULL,
|
11870
12001
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -11888,7 +12019,7 @@ struct llm_build_context {
|
|
11888
12019
|
cb(cur, "result_norm", -1);
|
11889
12020
|
|
11890
12021
|
// lm_head
|
11891
|
-
cur =
|
12022
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
11892
12023
|
cb(cur, "result_output", -1);
|
11893
12024
|
|
11894
12025
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -11940,7 +12071,7 @@ struct llm_build_context {
|
|
11940
12071
|
cb(cur, "attn_norm", il);
|
11941
12072
|
|
11942
12073
|
// {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
|
11943
|
-
struct lm_ggml_tensor * xz = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
|
12074
|
+
struct lm_ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur);
|
11944
12075
|
// split the above in two
|
11945
12076
|
// => {d_inner, n_tokens}
|
11946
12077
|
struct lm_ggml_tensor * x = lm_ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
|
@@ -11979,14 +12110,14 @@ struct llm_build_context {
|
|
11979
12110
|
// ssm
|
11980
12111
|
{
|
11981
12112
|
// {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
|
11982
|
-
struct lm_ggml_tensor * x_db = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
|
12113
|
+
struct lm_ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x);
|
11983
12114
|
// split
|
11984
12115
|
struct lm_ggml_tensor * dt = lm_ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
|
11985
12116
|
struct lm_ggml_tensor * B = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*dt_rank);
|
11986
12117
|
struct lm_ggml_tensor * C = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*(dt_rank+d_state));
|
11987
12118
|
|
11988
12119
|
// {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
|
11989
|
-
dt = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
|
12120
|
+
dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
|
11990
12121
|
dt = lm_ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
|
11991
12122
|
|
11992
12123
|
// Custom operator to optimize the parallel associative scan
|
@@ -12017,7 +12148,7 @@ struct llm_build_context {
|
|
12017
12148
|
y = lm_ggml_mul(ctx0, y, lm_ggml_silu(ctx0, z));
|
12018
12149
|
|
12019
12150
|
// {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
|
12020
|
-
cur = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
|
12151
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y);
|
12021
12152
|
}
|
12022
12153
|
|
12023
12154
|
// residual
|
@@ -12036,7 +12167,7 @@ struct llm_build_context {
|
|
12036
12167
|
cb(cur, "result_norm", -1);
|
12037
12168
|
|
12038
12169
|
// lm_head
|
12039
|
-
cur =
|
12170
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
12040
12171
|
cb(cur, "result_output", -1);
|
12041
12172
|
|
12042
12173
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -12075,21 +12206,21 @@ struct llm_build_context {
|
|
12075
12206
|
// self-attention
|
12076
12207
|
{
|
12077
12208
|
// compute Q and K and RoPE them
|
12078
|
-
struct lm_ggml_tensor * Qcur =
|
12209
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
12079
12210
|
cb(Qcur, "Qcur", il);
|
12080
12211
|
if (model.layers[il].bq) {
|
12081
12212
|
Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
|
12082
12213
|
cb(Qcur, "Qcur", il);
|
12083
12214
|
}
|
12084
12215
|
|
12085
|
-
struct lm_ggml_tensor * Kcur =
|
12216
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
12086
12217
|
cb(Kcur, "Kcur", il);
|
12087
12218
|
if (model.layers[il].bk) {
|
12088
12219
|
Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
|
12089
12220
|
cb(Kcur, "Kcur", il);
|
12090
12221
|
}
|
12091
12222
|
|
12092
|
-
struct lm_ggml_tensor * Vcur =
|
12223
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
12093
12224
|
cb(Vcur, "Vcur", il);
|
12094
12225
|
if (model.layers[il].bv) {
|
12095
12226
|
Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
|
@@ -12135,7 +12266,7 @@ struct llm_build_context {
|
|
12135
12266
|
);
|
12136
12267
|
cb(Kcur, "Kcur", il);
|
12137
12268
|
|
12138
|
-
cur = llm_build_kv(ctx0,
|
12269
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12139
12270
|
model.layers[il].wo, model.layers[il].bo,
|
12140
12271
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
12141
12272
|
}
|
@@ -12152,7 +12283,7 @@ struct llm_build_context {
|
|
12152
12283
|
|
12153
12284
|
// feed-forward network
|
12154
12285
|
{
|
12155
|
-
cur = llm_build_ffn(ctx0, ffn_inp,
|
12286
|
+
cur = llm_build_ffn(ctx0, lctx, ffn_inp,
|
12156
12287
|
model.layers[il].ffn_up, NULL, NULL,
|
12157
12288
|
model.layers[il].ffn_gate, NULL, NULL,
|
12158
12289
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -12179,7 +12310,7 @@ struct llm_build_context {
|
|
12179
12310
|
cb(cur, "result_norm", -1);
|
12180
12311
|
|
12181
12312
|
// lm_head
|
12182
|
-
cur =
|
12313
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
12183
12314
|
|
12184
12315
|
if (f_logit_scale) {
|
12185
12316
|
cur = lm_ggml_scale(ctx0, cur, f_logit_scale);
|
@@ -12232,21 +12363,21 @@ struct llm_build_context {
|
|
12232
12363
|
// self-attention
|
12233
12364
|
{
|
12234
12365
|
// compute Q and K and RoPE them
|
12235
|
-
struct lm_ggml_tensor * Qcur =
|
12366
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
12236
12367
|
cb(Qcur, "Qcur", il);
|
12237
12368
|
if (hparams.f_clamp_kqv > 0.0f) {
|
12238
12369
|
Qcur = lm_ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
12239
12370
|
cb(Qcur, "Qcur", il);
|
12240
12371
|
}
|
12241
12372
|
|
12242
|
-
struct lm_ggml_tensor * Kcur =
|
12373
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
12243
12374
|
cb(Kcur, "Kcur", il);
|
12244
12375
|
if (hparams.f_clamp_kqv > 0.0f) {
|
12245
12376
|
Kcur = lm_ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
12246
12377
|
cb(Kcur, "Kcur", il);
|
12247
12378
|
}
|
12248
12379
|
|
12249
|
-
struct lm_ggml_tensor * Vcur =
|
12380
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
12250
12381
|
cb(Vcur, "Vcur", il);
|
12251
12382
|
if (hparams.f_clamp_kqv > 0.0f) {
|
12252
12383
|
Vcur = lm_ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
@@ -12267,7 +12398,7 @@ struct llm_build_context {
|
|
12267
12398
|
);
|
12268
12399
|
cb(Kcur, "Kcur", il);
|
12269
12400
|
|
12270
|
-
cur = llm_build_kv(ctx0,
|
12401
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12271
12402
|
model.layers[il].wo, nullptr,
|
12272
12403
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
12273
12404
|
}
|
@@ -12289,7 +12420,7 @@ struct llm_build_context {
|
|
12289
12420
|
LLM_NORM, cb, il);
|
12290
12421
|
cb(cur, "ffn_norm", il);
|
12291
12422
|
|
12292
|
-
cur = llm_build_ffn(ctx0, cur,
|
12423
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
12293
12424
|
model.layers[il].ffn_up, NULL, NULL,
|
12294
12425
|
model.layers[il].ffn_gate, NULL, NULL,
|
12295
12426
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -12315,7 +12446,7 @@ struct llm_build_context {
|
|
12315
12446
|
cb(cur, "result_norm", -1);
|
12316
12447
|
|
12317
12448
|
// lm_head
|
12318
|
-
cur =
|
12449
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
12319
12450
|
cb(cur, "result_output", -1);
|
12320
12451
|
|
12321
12452
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -12355,7 +12486,7 @@ struct llm_build_context {
|
|
12355
12486
|
|
12356
12487
|
// self-attention
|
12357
12488
|
{
|
12358
|
-
cur =
|
12489
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
12359
12490
|
cb(cur, "wqkv", il);
|
12360
12491
|
|
12361
12492
|
cur = lm_ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
|
@@ -12394,7 +12525,7 @@ struct llm_build_context {
|
|
12394
12525
|
Vcur = lm_ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
|
12395
12526
|
cb(Qcur, "Vcur", il);
|
12396
12527
|
|
12397
|
-
cur = llm_build_kv(ctx0,
|
12528
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12398
12529
|
model.layers[il].wo, NULL,
|
12399
12530
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
12400
12531
|
}
|
@@ -12416,7 +12547,7 @@ struct llm_build_context {
|
|
12416
12547
|
LLM_NORM_RMS, cb, il);
|
12417
12548
|
cb(cur, "ffn_norm", il);
|
12418
12549
|
|
12419
|
-
cur = llm_build_ffn(ctx0, cur,
|
12550
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
12420
12551
|
model.layers[il].ffn_up, NULL, NULL,
|
12421
12552
|
model.layers[il].ffn_gate, NULL, NULL,
|
12422
12553
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -12440,7 +12571,7 @@ struct llm_build_context {
|
|
12440
12571
|
LLM_NORM_RMS, cb, -1);
|
12441
12572
|
cb(cur, "result_norm", -1);
|
12442
12573
|
|
12443
|
-
cur =
|
12574
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
12444
12575
|
cb(cur, "result_output", -1);
|
12445
12576
|
|
12446
12577
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -12475,7 +12606,7 @@ struct llm_build_context {
|
|
12475
12606
|
|
12476
12607
|
// self-attention
|
12477
12608
|
{
|
12478
|
-
cur =
|
12609
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
12479
12610
|
cb(cur, "wqkv", il);
|
12480
12611
|
|
12481
12612
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
|
@@ -12503,7 +12634,7 @@ struct llm_build_context {
|
|
12503
12634
|
);
|
12504
12635
|
cb(Kcur, "Kcur", il);
|
12505
12636
|
|
12506
|
-
cur = llm_build_kv(ctx0,
|
12637
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12507
12638
|
model.layers[il].wo, model.layers[il].bo,
|
12508
12639
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
12509
12640
|
}
|
@@ -12528,7 +12659,7 @@ struct llm_build_context {
|
|
12528
12659
|
LLM_NORM, cb, il);
|
12529
12660
|
cb(cur, "ffn_norm", il);
|
12530
12661
|
|
12531
|
-
cur = llm_build_ffn(ctx0, cur,
|
12662
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
12532
12663
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
12533
12664
|
NULL, NULL, NULL,
|
12534
12665
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -12559,7 +12690,7 @@ struct llm_build_context {
|
|
12559
12690
|
LLM_NORM, cb, il);
|
12560
12691
|
cb(cur, "ffn_norm", il);
|
12561
12692
|
|
12562
|
-
cur = llm_build_ffn(ctx0, cur,
|
12693
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
12563
12694
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
12564
12695
|
NULL, NULL, NULL,
|
12565
12696
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -12582,7 +12713,7 @@ struct llm_build_context {
|
|
12582
12713
|
LLM_NORM, cb, -1);
|
12583
12714
|
cb(cur, "result_norm", -1);
|
12584
12715
|
|
12585
|
-
cur =
|
12716
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
12586
12717
|
cb(cur, "result_output", -1);
|
12587
12718
|
|
12588
12719
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -12623,13 +12754,13 @@ struct llm_build_context {
|
|
12623
12754
|
// self-attention
|
12624
12755
|
{
|
12625
12756
|
// compute Q and K and RoPE them
|
12626
|
-
struct lm_ggml_tensor * Qcur =
|
12757
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
12627
12758
|
cb(Qcur, "Qcur", il);
|
12628
12759
|
|
12629
|
-
struct lm_ggml_tensor * Kcur =
|
12760
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
12630
12761
|
cb(Kcur, "Kcur", il);
|
12631
12762
|
|
12632
|
-
struct lm_ggml_tensor * Vcur =
|
12763
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
12633
12764
|
cb(Vcur, "Vcur", il);
|
12634
12765
|
|
12635
12766
|
Qcur = lm_ggml_rope_ext(
|
@@ -12646,7 +12777,7 @@ struct llm_build_context {
|
|
12646
12777
|
);
|
12647
12778
|
cb(Kcur, "Kcur", il);
|
12648
12779
|
|
12649
|
-
cur = llm_build_kv(ctx0,
|
12780
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12650
12781
|
model.layers[il].wo, NULL,
|
12651
12782
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
12652
12783
|
}
|
@@ -12668,7 +12799,7 @@ struct llm_build_context {
|
|
12668
12799
|
LLM_NORM_RMS, cb, il);
|
12669
12800
|
cb(cur, "ffn_norm", il);
|
12670
12801
|
|
12671
|
-
cur = llm_build_ffn(ctx0, cur,
|
12802
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
12672
12803
|
model.layers[il].ffn_up, NULL, NULL,
|
12673
12804
|
model.layers[il].ffn_gate, NULL, NULL,
|
12674
12805
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -12685,7 +12816,7 @@ struct llm_build_context {
|
|
12685
12816
|
LLM_NORM_RMS, cb, il);
|
12686
12817
|
cb(cur, "ffn_norm_exps", il);
|
12687
12818
|
|
12688
|
-
cur = llm_build_moe_ffn(ctx0, cur,
|
12819
|
+
cur = llm_build_moe_ffn(ctx0, lctx, cur,
|
12689
12820
|
model.layers[il].ffn_gate_inp,
|
12690
12821
|
model.layers[il].ffn_up_exps,
|
12691
12822
|
model.layers[il].ffn_gate_exps,
|
@@ -12714,7 +12845,7 @@ struct llm_build_context {
|
|
12714
12845
|
cb(cur, "result_norm", -1);
|
12715
12846
|
|
12716
12847
|
// lm_head
|
12717
|
-
cur =
|
12848
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
12718
12849
|
cb(cur, "result_output", -1);
|
12719
12850
|
|
12720
12851
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -12868,7 +12999,7 @@ struct llm_build_context {
|
|
12868
12999
|
struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
|
12869
13000
|
cb(k_states, "k_states", il);
|
12870
13001
|
|
12871
|
-
cur = llm_build_kv(ctx0,
|
13002
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
12872
13003
|
model.layers[il].wo, NULL,
|
12873
13004
|
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
12874
13005
|
}
|
@@ -12890,7 +13021,7 @@ struct llm_build_context {
|
|
12890
13021
|
cb(cur, "ffn_norm", il);
|
12891
13022
|
|
12892
13023
|
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
12893
|
-
cur = llm_build_ffn(ctx0, cur,
|
13024
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
12894
13025
|
model.layers[il].ffn_up, NULL, NULL,
|
12895
13026
|
model.layers[il].ffn_gate, NULL, NULL,
|
12896
13027
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -12900,7 +13031,7 @@ struct llm_build_context {
|
|
12900
13031
|
} else {
|
12901
13032
|
// MoE branch
|
12902
13033
|
lm_ggml_tensor * moe_out =
|
12903
|
-
llm_build_moe_ffn(ctx0, cur,
|
13034
|
+
llm_build_moe_ffn(ctx0, lctx, cur,
|
12904
13035
|
model.layers[il].ffn_gate_inp,
|
12905
13036
|
model.layers[il].ffn_up_exps,
|
12906
13037
|
model.layers[il].ffn_gate_exps,
|
@@ -12913,7 +13044,7 @@ struct llm_build_context {
|
|
12913
13044
|
|
12914
13045
|
// FFN shared expert
|
12915
13046
|
{
|
12916
|
-
lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
|
13047
|
+
lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
|
12917
13048
|
model.layers[il].ffn_up_shexp, NULL, NULL,
|
12918
13049
|
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
12919
13050
|
model.layers[il].ffn_down_shexp, NULL, NULL,
|
@@ -12978,7 +13109,7 @@ struct llm_build_context {
|
|
12978
13109
|
// self-attention
|
12979
13110
|
{
|
12980
13111
|
// compute Q and K and RoPE them
|
12981
|
-
struct lm_ggml_tensor * Qcur =
|
13112
|
+
struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
12982
13113
|
Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
|
12983
13114
|
cb(Qcur, "Qcur", il);
|
12984
13115
|
if (model.layers[il].bq) {
|
@@ -12987,7 +13118,7 @@ struct llm_build_context {
|
|
12987
13118
|
}
|
12988
13119
|
|
12989
13120
|
// B1.K
|
12990
|
-
struct lm_ggml_tensor * Kcur =
|
13121
|
+
struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
12991
13122
|
Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
|
12992
13123
|
cb(Kcur, "Kcur", il);
|
12993
13124
|
if (model.layers[il].bk) {
|
@@ -12996,7 +13127,7 @@ struct llm_build_context {
|
|
12996
13127
|
}
|
12997
13128
|
|
12998
13129
|
// B1.V
|
12999
|
-
struct lm_ggml_tensor * Vcur =
|
13130
|
+
struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
13000
13131
|
Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
|
13001
13132
|
cb(Vcur, "Vcur", il);
|
13002
13133
|
if (model.layers[il].bv) {
|
@@ -13018,7 +13149,7 @@ struct llm_build_context {
|
|
13018
13149
|
);
|
13019
13150
|
cb(Kcur, "Kcur", il);
|
13020
13151
|
|
13021
|
-
cur = llm_build_kv(ctx0,
|
13152
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
13022
13153
|
NULL, NULL,
|
13023
13154
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
13024
13155
|
|
@@ -13027,7 +13158,7 @@ struct llm_build_context {
|
|
13027
13158
|
LLM_NORM_RMS, cb, il);
|
13028
13159
|
cb(cur, "attn_sub_norm", il);
|
13029
13160
|
|
13030
|
-
cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
13161
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
13031
13162
|
cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
|
13032
13163
|
if (model.layers[il].bo) {
|
13033
13164
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
|
@@ -13051,7 +13182,7 @@ struct llm_build_context {
|
|
13051
13182
|
LLM_NORM_RMS, cb, il);
|
13052
13183
|
cb(cur, "ffn_norm", il);
|
13053
13184
|
|
13054
|
-
cur = llm_build_ffn(ctx0, cur,
|
13185
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
13055
13186
|
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
|
13056
13187
|
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
|
13057
13188
|
NULL, NULL, NULL,
|
@@ -13064,7 +13195,7 @@ struct llm_build_context {
|
|
13064
13195
|
LLM_NORM_RMS, cb, il);
|
13065
13196
|
cb(cur, "ffn_sub_norm", il);
|
13066
13197
|
|
13067
|
-
cur = lm_ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
|
13198
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
|
13068
13199
|
cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
|
13069
13200
|
cb(cur, "ffn_down", il);
|
13070
13201
|
|
@@ -13083,7 +13214,7 @@ struct llm_build_context {
|
|
13083
13214
|
cb(cur, "result_norm", -1);
|
13084
13215
|
|
13085
13216
|
// lm_head
|
13086
|
-
cur = lm_ggml_mul_mat(ctx0, model.tok_embd, cur);
|
13217
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
|
13087
13218
|
cb(cur, "result_output", -1);
|
13088
13219
|
|
13089
13220
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -13185,7 +13316,7 @@ struct llm_build_context {
|
|
13185
13316
|
cb(cur, "ffn_norm", il);
|
13186
13317
|
|
13187
13318
|
// T5 uses relu, flan-T5 uses gelu-gated
|
13188
|
-
cur = llm_build_ffn(ctx0, cur,
|
13319
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
13189
13320
|
model.layers[il].ffn_up_enc, NULL, NULL,
|
13190
13321
|
model.layers[il].ffn_gate_enc, NULL, NULL,
|
13191
13322
|
model.layers[il].ffn_down_enc, NULL, NULL,
|
@@ -13365,7 +13496,7 @@ struct llm_build_context {
|
|
13365
13496
|
cb(cur, "ffn_norm", il);
|
13366
13497
|
|
13367
13498
|
// T5 uses relu, flan-T5 uses gelu-gated
|
13368
|
-
cur = llm_build_ffn(ctx0, cur,
|
13499
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
13369
13500
|
model.layers[il].ffn_up, NULL, NULL,
|
13370
13501
|
model.layers[il].ffn_gate, NULL, NULL,
|
13371
13502
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -13431,7 +13562,7 @@ struct llm_build_context {
|
|
13431
13562
|
|
13432
13563
|
// self-attention
|
13433
13564
|
{
|
13434
|
-
cur =
|
13565
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
13435
13566
|
cb(cur, "wqkv", il);
|
13436
13567
|
|
13437
13568
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
|
@@ -13447,7 +13578,7 @@ struct llm_build_context {
|
|
13447
13578
|
|
13448
13579
|
Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
13449
13580
|
|
13450
|
-
cur = llm_build_kv(ctx0,
|
13581
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
13451
13582
|
model.layers[il].wo, model.layers[il].bo,
|
13452
13583
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
|
13453
13584
|
}
|
@@ -13471,7 +13602,7 @@ struct llm_build_context {
|
|
13471
13602
|
LLM_NORM, cb, il);
|
13472
13603
|
cb(cur, "ffn_norm", il);
|
13473
13604
|
|
13474
|
-
cur = llm_build_ffn(ctx0, cur,
|
13605
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
13475
13606
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
13476
13607
|
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
13477
13608
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
@@ -13490,7 +13621,7 @@ struct llm_build_context {
|
|
13490
13621
|
LLM_NORM, cb, -1);
|
13491
13622
|
cb(cur, "result_norm", -1);
|
13492
13623
|
|
13493
|
-
cur =
|
13624
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
13494
13625
|
|
13495
13626
|
cb(cur, "result_output", -1);
|
13496
13627
|
|
@@ -13532,7 +13663,7 @@ struct llm_build_context {
|
|
13532
13663
|
struct lm_ggml_tensor * Kcur = nullptr;
|
13533
13664
|
struct lm_ggml_tensor * Vcur = nullptr;
|
13534
13665
|
|
13535
|
-
cur =
|
13666
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
13536
13667
|
cb(cur, "wqkv", il);
|
13537
13668
|
|
13538
13669
|
cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
|
@@ -13560,7 +13691,7 @@ struct llm_build_context {
|
|
13560
13691
|
);
|
13561
13692
|
cb(Kcur, "Kcur_rope", il);
|
13562
13693
|
|
13563
|
-
cur = llm_build_kv(ctx0,
|
13694
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
13564
13695
|
model.layers[il].wo, NULL,
|
13565
13696
|
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
13566
13697
|
|
@@ -13585,7 +13716,7 @@ struct llm_build_context {
|
|
13585
13716
|
LLM_NORM_RMS, cb, il);
|
13586
13717
|
cb(cur, "ffn_norm", il);
|
13587
13718
|
|
13588
|
-
cur = llm_build_ffn(ctx0, cur,
|
13719
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
13589
13720
|
model.layers[il].ffn_up, NULL, NULL,
|
13590
13721
|
NULL, NULL, NULL,
|
13591
13722
|
model.layers[il].ffn_down, NULL, NULL,
|
@@ -13605,7 +13736,7 @@ struct llm_build_context {
|
|
13605
13736
|
LLM_NORM_RMS, cb, -1);
|
13606
13737
|
cb(cur, "result_norm", -1);
|
13607
13738
|
|
13608
|
-
cur =
|
13739
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
13609
13740
|
cb(cur, "result_output", -1);
|
13610
13741
|
|
13611
13742
|
lm_ggml_build_forward_expand(gf, cur);
|
@@ -15032,6 +15163,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
|
15032
15163
|
|
15033
15164
|
// apply K-shift if needed
|
15034
15165
|
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
15166
|
+
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
|
15167
|
+
LM_GGML_ASSERT(false && "Deepseek2 does not support K-shift");
|
15168
|
+
}
|
15169
|
+
|
15035
15170
|
{
|
15036
15171
|
lm_ggml_backend_sched_reset(lctx.sched);
|
15037
15172
|
|
@@ -15426,6 +15561,8 @@ struct llm_tokenizer_bpe {
|
|
15426
15561
|
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
15427
15562
|
case LLAMA_VOCAB_PRE_TYPE_REFACT:
|
15428
15563
|
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
15564
|
+
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
|
15565
|
+
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
|
15429
15566
|
regex_exprs = {
|
15430
15567
|
"\\p{N}",
|
15431
15568
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
@@ -15463,6 +15600,13 @@ struct llm_tokenizer_bpe {
|
|
15463
15600
|
"\\p{N}",
|
15464
15601
|
};
|
15465
15602
|
break;
|
15603
|
+
case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
|
15604
|
+
// original regex from tokenizer.json
|
15605
|
+
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
15606
|
+
regex_exprs = {
|
15607
|
+
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
15608
|
+
};
|
15609
|
+
break;
|
15466
15610
|
default:
|
15467
15611
|
// default regex for BPE tokenization pre-processing
|
15468
15612
|
regex_exprs = {
|
@@ -17964,10 +18108,10 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
|
|
17964
18108
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = LM_GGML_TYPE_Q4_K;
|
17965
18109
|
//}
|
17966
18110
|
bool convert_incompatible_tensor = false;
|
17967
|
-
if (new_type == LM_GGML_TYPE_Q2_K
|
17968
|
-
new_type == LM_GGML_TYPE_Q5_K
|
17969
|
-
new_type == LM_GGML_TYPE_IQ2_XS
|
17970
|
-
new_type == LM_GGML_TYPE_IQ3_XXS || new_type == LM_GGML_TYPE_IQ1_S
|
18111
|
+
if (new_type == LM_GGML_TYPE_Q2_K || new_type == LM_GGML_TYPE_Q3_K || new_type == LM_GGML_TYPE_Q4_K ||
|
18112
|
+
new_type == LM_GGML_TYPE_Q5_K || new_type == LM_GGML_TYPE_Q6_K || new_type == LM_GGML_TYPE_IQ4_XS ||
|
18113
|
+
new_type == LM_GGML_TYPE_IQ2_XS || new_type == LM_GGML_TYPE_IQ2_XXS || new_type == LM_GGML_TYPE_IQ2_S ||
|
18114
|
+
new_type == LM_GGML_TYPE_IQ3_XXS || new_type == LM_GGML_TYPE_IQ1_S || new_type == LM_GGML_TYPE_IQ3_S ||
|
17971
18115
|
new_type == LM_GGML_TYPE_IQ1_M) {
|
17972
18116
|
int nx = tensor->ne[0];
|
17973
18117
|
int ny = tensor->ne[1];
|
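
For context on the widened type list above: k-quants and i-quants pack each row into 256-wide super-blocks, so this branch feeds a divisibility check on the tensor's first dimension and flags tensors that cannot be represented in the chosen type. A condensed sketch of that guard follows; `is_k_or_i_quant()` is a hypothetical stand-in for the long type comparison shown in the diff, and `QK_K` is the super-block size.

```cpp
// Sketch of the guard the expanded list feeds: rows whose length is not a
// multiple of the 256-wide super-block cannot use k-/i-quants.
if (is_k_or_i_quant(new_type)) {          // hypothetical helper for the comparison above
    const int64_t nx = tensor->ne[0];
    if (nx % QK_K != 0) {
        // incompatible row length: fall back to a block-size-compatible
        // type later in the function
        convert_incompatible_tensor = true;
    }
}
```
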
@@ -18153,8 +18297,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
18153
18297
|
|
18154
18298
|
// copy the KV pairs from the input file
|
18155
18299
|
lm_gguf_set_kv (ctx_out, ml.meta);
|
18156
|
-
lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION);
|
18157
|
-
lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
18300
|
+
lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); // TODO: use LLM_KV
|
18301
|
+
lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
|
18302
|
+
|
18158
18303
|
// Remove split metadata
|
18159
18304
|
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
18160
18305
|
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
|
@@ -18469,282 +18614,210 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
18469
18614
|
}
|
18470
18615
|
}
|
18471
18616
|
|
18472
|
-
static
|
18473
|
-
|
18474
|
-
) {
|
18475
|
-
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
18476
|
-
|
18477
|
-
const int64_t t_start_lora_us = lm_ggml_time_us();
|
18478
|
-
|
18479
|
-
llama_file fin(path_lora, "rb");
|
18480
|
-
|
18481
|
-
// verify magic and version
|
18482
|
-
{
|
18483
|
-
uint32_t magic = fin.read_u32();
|
18484
|
-
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
18485
|
-
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
18486
|
-
return 1;
|
18487
|
-
}
|
18617
|
+
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
|
18618
|
+
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
|
18488
18619
|
|
18489
|
-
|
18490
|
-
|
18491
|
-
|
18492
|
-
|
18493
|
-
}
|
18494
|
-
}
|
18495
|
-
|
18496
|
-
int32_t lora_r = fin.read_u32();
|
18497
|
-
int32_t lora_alpha = fin.read_u32();
|
18498
|
-
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
18499
|
-
|
18500
|
-
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
18501
|
-
|
18502
|
-
// load base model
|
18503
|
-
std::unique_ptr<llama_model_loader> ml;
|
18504
|
-
if (path_base_model) {
|
18505
|
-
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
18506
|
-
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
18507
|
-
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
18508
|
-
}
|
18509
|
-
|
18510
|
-
struct tensor_meta {
|
18511
|
-
std::string name;
|
18512
|
-
lm_ggml_type type;
|
18513
|
-
int32_t ne[2];
|
18514
|
-
size_t offset;
|
18620
|
+
lm_ggml_context * ctx = nullptr;
|
18621
|
+
struct lm_gguf_init_params meta_lm_gguf_params = {
|
18622
|
+
/* .no_alloc = */ true,
|
18623
|
+
/* .ctx = */ &ctx,
|
18515
18624
|
};
|
18516
|
-
|
18517
|
-
|
18518
|
-
|
18519
|
-
|
18520
|
-
if (fin.tell() == fin.size) {
|
18521
|
-
// eof
|
18522
|
-
break;
|
18523
|
-
}
|
18524
|
-
|
18525
|
-
int32_t n_dims;
|
18526
|
-
int32_t name_len;
|
18527
|
-
int32_t ftype;
|
18528
|
-
|
18529
|
-
fin.read_raw(&n_dims, sizeof(n_dims));
|
18530
|
-
fin.read_raw(&name_len, sizeof(name_len));
|
18531
|
-
fin.read_raw(&ftype, sizeof(ftype));
|
18532
|
-
|
18533
|
-
if (n_dims != 1 && n_dims != 2) {
|
18534
|
-
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
18535
|
-
return 1;
|
18536
|
-
}
|
18625
|
+
struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(path_lora, meta_lm_gguf_params);
|
18626
|
+
if (!ctx_gguf) {
|
18627
|
+
throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
|
18628
|
+
}
|
18537
18629
|
|
18538
|
-
|
18539
|
-
|
18540
|
-
|
18541
|
-
|
18630
|
+
// check metadata
|
18631
|
+
{
|
18632
|
+
auto get_kv_str = [&](const std::string & key) -> std::string {
|
18633
|
+
int id = lm_gguf_find_key(ctx_gguf, key.c_str());
|
18634
|
+
return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf, id));
|
18635
|
+
};
|
18636
|
+
auto get_kv_f32 = [&](const std::string & key) -> float {
|
18637
|
+
int id = lm_gguf_find_key(ctx_gguf, key.c_str());
|
18638
|
+
return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf, id);
|
18639
|
+
};
|
18640
|
+
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
18542
18641
|
|
18543
|
-
|
18544
|
-
{
|
18545
|
-
|
18546
|
-
|
18547
|
-
fin.read_raw(buf, name_len);
|
18548
|
-
name = std::string(buf, name_len);
|
18642
|
+
+            auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+            if (general_type != "adapter") {
+                lm_gguf_free(ctx_gguf);
+                throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
             }

-
-
-            if (
-
-
-            if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
-                LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-                return 1;
+            auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+            auto general_arch = llm_arch_from_string(general_arch_str);
+            if (general_arch != model->arch) {
+                lm_gguf_free(ctx_gguf);
+                throw std::runtime_error("model arch and LoRA arch mismatch");
             }

-
-
-
-
-            case 1: wtype = LM_GGML_TYPE_F16; break;
-            default:
-                {
-                    LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
-                            __func__, ftype);
-                    return 1;
-                }
+            auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+            if (adapter_type != "lora") {
+                lm_gguf_free(ctx_gguf);
+                throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
             }

-
-            size_t offset = fin.tell();
-            offset = (offset + 31) & -32;
-
-            // skip tensor data
-            fin.seek(offset + lm_ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
-
-            tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
+            adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
         }

-
-    int n_tensors = 0;
-
-    // apply
-    lm_ggml_backend_t backend_cpu = lm_ggml_backend_cpu_init();
-    if (backend_cpu == nullptr) {
-        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
-        return 1;
-    }
-    lm_ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
-
-    std::vector<no_init<uint8_t>> read_buf;
-    for (const auto & it : model.tensors_by_name) {
-        const std::string & base_name = it.first;
-        lm_ggml_tensor * model_t = it.second;
-
-        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
-            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
-            continue;
-        }
-
-        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
-        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
+    int n_tensors = lm_gguf_get_n_tensors(ctx_gguf);

-
-
-
-
+    // contexts for each buffer type
+    std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+    auto get_ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
+            struct lm_ggml_init_params params = {
+                /*.mem_size   =*/ n_tensors*lm_ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            lm_ggml_context * buft_ctx = lm_ggml_init(params);
+            ctx_map[buft] = buft_ctx;
+            return buft_ctx;
         };
-
-
-            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
-            lm_ggml_backend_free(backend_cpu);
-            return 1;
-        }
-
-        // create tensors
-        lm_ggml_tensor * loraA = lm_ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
-        lm_ggml_tensor * loraB = lm_ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
-        lm_ggml_set_name(loraA, metaA.name.c_str());
-        lm_ggml_set_name(loraB, metaB.name.c_str());
+        return it->second;
+    };

-
-
-
-
-
+    // bundle lora_a and lora_b into pairs
+    std::map<std::string, llama_lora_weight> ab_map;
+    auto str_endswith = [](const std::string & str, const std::string & suffix) {
+        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+    };
+    for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur; cur = lm_ggml_get_next_tensor(ctx, cur)) {
+        std::string name(cur->name);
+        if (str_endswith(name, ".lora_a")) {
+            replace_all(name, ".lora_a", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_lora_weight(cur, nullptr);
+            } else {
+                ab_map[name].a = cur;
+            }
+        } else if (str_endswith(name, ".lora_b")) {
+            replace_all(name, ".lora_b", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_lora_weight(nullptr, cur);
+            } else {
+                ab_map[name].b = cur;
             }
-            base_t = lm_ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
         } else {
-
-
-
-
-            // allocate in backend buffer
-            lm_ggml_backend_buffer_t lora_buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, lm_ggml_backend_cpu_buffer_type());
-            if (lora_buf == nullptr) {
-                LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
-                return 1;
+            lm_gguf_free(ctx_gguf);
+            lm_ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
+    }

-
-
-
-
-            fin.read_raw(read_buf.data(), lm_ggml_nbytes(tensor));
-            lm_ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
-        };
-        load_tensor(metaA, loraA);
-        load_tensor(metaB, loraB);
+    // add tensors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_lora_weight & w = it.second;

-
-
-
-
-        lm_ggml_backend_tensor_copy(model_t, base_t);
+        if (!w.a || !w.b) {
+            lm_gguf_free(ctx_gguf);
+            lm_ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }

-
-
-
-
+        // device buft and device ctx
+        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
+        if (!model_tensor) {
+            lm_gguf_free(ctx_gguf);
+            lm_ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
         }
-
-
-
-
-            lm_ggml_free(
-
-            lm_ggml_backend_free(backend_cpu);
-            return 1;
+        struct lm_ggml_context * dev_ctx = get_ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
+        // validate tensor shape
+        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+            lm_gguf_free(ctx_gguf);
+            lm_ggml_free(ctx);
+            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
+        if (w.a->ne[1] != w.b->ne[0]) {
+            lm_gguf_free(ctx_gguf);
+            lm_ggml_free(ctx);
+            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        }
+        // save tensor to adapter
+        struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+        struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
+        lm_ggml_set_name(tensor_a, w.a->name);
+        lm_ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+    }

-
-
-
-
-
-
-
-
-
-
-
-
-        lm_ggml_set_name(r, "r_add");
-
-        if (base_t->type != model_t->type) {
-            // convert the result to the model type
-            r = lm_ggml_cast(lora_ctx, r, model_t->type);
-            lm_ggml_set_name(r, "r_cast");
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto it : ctx_map) {
+            lm_ggml_backend_buffer_type_t buft = it.first;
+            lm_ggml_context * ctx_dev = it.second;
+            lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
+            if (!buf) {
+                lm_gguf_free(ctx_gguf);
+                lm_ggml_free(ctx);
+                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
             }
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+            adapter.ctxs.push_back(ctx_dev);
+            adapter.bufs.push_back(buf);
+        }
+    }

-
+    // set tensor data
+    {
+        llama_file lm_gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
+            size_t offs = lm_gguf_get_data_offset(ctx_gguf) + lm_gguf_get_tensor_offset(ctx_gguf, lm_gguf_find_tensor(ctx_gguf, orig->name));
+            size_t size = lm_ggml_nbytes(orig);
+            read_buf.resize(size);
+            lm_gguf_file.seek(offs, SEEK_SET);
+            lm_gguf_file.read_raw(read_buf.data(), size);
+            lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
         };
-
-
-
-
-
-        lm_ggml_backend_buffer_t graph_buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, lm_ggml_backend_cpu_buffer_type());
-        if (graph_buf == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
-            lm_ggml_free(lora_ctx);
-            lm_ggml_backend_buffer_free(lora_buf);
-            lm_ggml_backend_free(backend_cpu);
-            return 1;
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev  = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
         }
+    }

-
-
-        lm_ggml_backend_tensor_set(model_t, r->data, 0, lm_ggml_nbytes(r));
-
-#if 0
-        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
-        //lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(backends.data(), backends.size(), LM_GGML_DEFAULT_GRAPH_SIZE);
-
-        // sched compute
-        lm_ggml_build_forward_expand(gf, build_graph());
-        lm_ggml_backend_sched_init_measure(sched, gf);
-
-        // create the graph again, since the previous one was destroyed by the measure
-        lm_ggml_graph_clear(gf);
-        lm_ggml_build_forward_expand(gf, build_graph());
-        lm_ggml_backend_sched_graph_compute(sched, gf);
-        lm_ggml_backend_sched_free(sched);
-#endif
+    LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);

-
-
-
+    // free ctx for reading gguf
+    lm_gguf_free(ctx_gguf);
+    lm_ggml_free(ctx);
+}

-
-
-
-
+int32_t llama_lora_adapter_set(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter,
+            float scale) {
+    if (ctx->cparams.flash_attn) {
+        LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
+        return -1;
     }
+    ctx->lora_adapters[adapter] = scale;
+    return 0;
+}

-
-
-
-
+int32_t llama_lora_adapter_remove(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter) {
+    auto pos = ctx->lora_adapters.find(adapter);
+    if (pos != ctx->lora_adapters.end()) {
+        ctx->lora_adapters.erase(pos);
+        return 0;
+    }
+    return -1;
+}

-
+void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
+    delete adapter;
 }

 //
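The hunk above replaces the old merge-into-the-model LoRA path (the removed `.loraA`/`.loraB` code that copied base tensors onto a CPU backend and wrote the merged result back) with a `llama_lora_adapter` object: the adapter's `lora_a`/`lora_b` tensors live in their own backend buffers and are attached to or detached from a context at runtime. A minimal usage sketch of the new public API follows; the file paths and parameter values are placeholders, not anything shipped in this package.

// Sketch only: demonstrates the llama_lora_adapter_* calls added in this diff.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("base-model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = false; // llama_lora_adapter_set() refuses to attach when flash_attn is enabled
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // load the adapter once per model, then attach it to a context with a scale factor
    llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
    if (adapter != nullptr) {
        llama_lora_adapter_set(ctx, adapter, 1.0f);

        // ... run decoding as usual ...

        llama_lora_adapter_remove(ctx, adapter); // detach from this context
        llama_lora_adapter_free(adapter);        // release the adapter's buffers
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}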
@@ -18838,6 +18911,8 @@ size_t llama_max_devices(void) {
     return LM_GGML_SYCL_MAX_DEVICES;
 #elif defined(LM_GGML_USE_VULKAN)
     return LM_GGML_VK_MAX_DEVICES;
+#elif defined(LM_GGML_USE_CANN)
+    return LM_GGML_CANN_MAX_DEVICES;
 #else
     return 1;
 #endif
@@ -19179,6 +19254,30 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(backend);
     }
+#elif defined(LM_GGML_USE_CANN)
+    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+    // TODO: lm_ggml_backend_cann is not support split tensor now, just leave code here.
+    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+        lm_ggml_backend_t backend = lm_ggml_backend_cann_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    } else {
+        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+        // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+        for (int32_t device = 0; device < lm_ggml_backend_cann_get_device_count(); ++device) {
+            lm_ggml_backend_t backend = lm_ggml_backend_cann_init(device);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+    }
 #endif

 #ifdef LM_GGML_USE_BLAS
@@ -19363,7 +19462,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
@@ -19393,6 +19491,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here
@@ -19525,12 +19624,14 @@ uint32_t llama_model_quantize(
     }
 }

-
+struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
     try {
-
+        struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
+        llama_lora_adapter_init_internal(model, path_lora, *adapter);
+        return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return
+        return nullptr;
     }
 }

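`llama_lora_adapter_init` catches the exceptions thrown by the internal loader and only returns `nullptr`, so the reason an adapter file was rejected shows up solely in the log. For debugging, the same GGUF metadata the loader validates can be inspected standalone through the `lm_gguf` API; the sketch below is an assumption built on the key strings that the `LLM_KV_*` enums used above resolve to (`general.type`, `general.architecture`, `adapter.type`, `adapter.lora.alpha`), not code from this package.

// Sketch only: prints the adapter metadata that llama_lora_adapter_init checks.
#include <cstdio>
#include "ggml.h"

static void inspect_adapter(const char * path) {
    struct lm_gguf_init_params params = {
        /*.no_alloc =*/ true,
        /*.ctx      =*/ NULL,
    };
    struct lm_gguf_context * gguf = lm_gguf_init_from_file(path, params);
    if (gguf == NULL) {
        printf("not a GGUF file: %s\n", path);
        return;
    }
    const int type_id  = lm_gguf_find_key(gguf, "general.type");         // loader expects "adapter"
    const int arch_id  = lm_gguf_find_key(gguf, "general.architecture"); // must match the base model arch
    const int atype_id = lm_gguf_find_key(gguf, "adapter.type");         // loader expects "lora"
    const int alpha_id = lm_gguf_find_key(gguf, "adapter.lora.alpha");
    printf("general.type         = %s\n", type_id  >= 0 ? lm_gguf_get_val_str(gguf, type_id)  : "(missing)");
    printf("general.architecture = %s\n", arch_id  >= 0 ? lm_gguf_get_val_str(gguf, arch_id)  : "(missing)");
    printf("adapter.type         = %s\n", atype_id >= 0 ? lm_gguf_get_val_str(gguf, atype_id) : "(missing)");
    if (alpha_id >= 0) {
        printf("adapter.lora.alpha   = %f\n", lm_gguf_get_val_f32(gguf, alpha_id));
    }
    printf("tensors              = %d\n", (int) lm_gguf_get_n_tensors(gguf));
    lm_gguf_free(gguf);
}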
@@ -19846,7 +19947,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     );

     // on session change it is very likely that the state size has changed - so we need to update this function
-    static_assert(LLAMA_SESSION_VERSION ==
+    static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");

     return s_total;
 }
@@ -21533,7 +21634,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == "
+    } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
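With this change the ChatGLM-4 formatting is reachable either by passing the literal template name "chatglm4" or when a model's built-in template contains "[gMASK]<sop>". A small, hedged sketch of exercising that branch through the public llama_chat_apply_template API; the messages are placeholders, and the exact rendered output depends on the remainder of the branch, which is not shown in this hunk.

// Sketch only: renders a short conversation with the "chatglm4" template by name.
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    const llama_chat_message chat[] = {
        { "user",      "Hello"               },
        { "assistant", "Hi, how can I help?" },
        { "user",      "Tell me a joke."     },
    };
    std::vector<char> buf(1024);
    // the model pointer may be NULL when an explicit template string or name is supplied
    const int32_t n = llama_chat_apply_template(NULL, "chatglm4", chat, 3,
                                                /*add_ass=*/ true, buf.data(), (int32_t) buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        printf("%.*s\n", n, buf.data());
    }
    return 0;
}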
@@ -21754,6 +21855,8 @@ void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) {
     lm_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #elif defined(LM_GGML_USE_CUDA)
     lm_ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(LM_GGML_USE_CANN)
+    lm_ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
