cui-llama.rn 1.0.1 → 1.0.3

This diff compares the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
package/cpp/llama.cpp CHANGED
@@ -19,6 +19,8 @@
19
19
  # include "ggml-sycl.h"
20
20
  #elif defined(LM_GGML_USE_KOMPUTE)
21
21
  # include "ggml-kompute.h"
22
+ #elif defined(LM_GGML_USE_CANN)
23
+ # include "ggml-cann.h"
22
24
  #endif
23
25
 
24
26
  #ifdef LM_GGML_USE_BLAS
@@ -112,7 +114,7 @@
112
114
 
113
115
  // bump if necessary
114
116
  #define LLAMA_MAX_NODES 8192
115
- #define LLAMA_MAX_LAYERS 256
117
+ #define LLAMA_MAX_LAYERS 512
116
118
  #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
117
119
 
118
120
  //
@@ -298,6 +300,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
298
300
  };
299
301
 
300
302
  enum llm_kv {
303
+ LLM_KV_GENERAL_TYPE,
301
304
  LLM_KV_GENERAL_ARCHITECTURE,
302
305
  LLM_KV_GENERAL_QUANTIZATION_VERSION,
303
306
  LLM_KV_GENERAL_ALIGNMENT,
@@ -388,9 +391,13 @@ enum llm_kv {
388
391
  LLM_KV_TOKENIZER_SUFFIX_ID,
389
392
  LLM_KV_TOKENIZER_MIDDLE_ID,
390
393
  LLM_KV_TOKENIZER_EOT_ID,
394
+
395
+ LLM_KV_ADAPTER_TYPE,
396
+ LLM_KV_ADAPTER_LORA_ALPHA,
391
397
  };
392
398
 
393
399
  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
400
+ { LLM_KV_GENERAL_TYPE, "general.type" },
394
401
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
395
402
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
396
403
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -481,6 +488,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
481
488
  { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
482
489
  { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
483
490
  { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
491
+
492
+ { LLM_KV_ADAPTER_TYPE, "adapter.type" },
493
+ { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
484
494
  };
485
495
 
486
496
  struct LLM_KV {
@@ -2082,6 +2092,8 @@ struct llama_state {
2082
2092
  lm_ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
2083
2093
  #elif defined(LM_GGML_USE_CUDA)
2084
2094
  lm_ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
2095
+ #elif defined(LM_GGML_USE_CANN)
2096
+ lm_ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
2085
2097
  #endif
2086
2098
  }
2087
2099
 
@@ -2714,6 +2726,9 @@ struct llama_model {
2714
2726
  int64_t t_load_us = 0;
2715
2727
  int64_t t_start_us = 0;
2716
2728
 
2729
+ // keep track of loaded lora adapters
2730
+ std::set<struct llama_lora_adapter *> lora_adapters;
2731
+
2717
2732
  ~llama_model() {
2718
2733
  for (struct lm_ggml_context * ctx : ctxs) {
2719
2734
  lm_ggml_free(ctx);
@@ -2726,6 +2741,9 @@ struct llama_model {
2726
2741
  #endif
2727
2742
  lm_ggml_backend_buffer_free(buf);
2728
2743
  }
2744
+ while (!lora_adapters.empty()) {
2745
+ llama_lora_adapter_free(*lora_adapters.begin());
2746
+ }
2729
2747
  }
2730
2748
  };
2731
2749
 
@@ -2830,6 +2848,52 @@ struct llama_context {
2830
2848
 
2831
2849
  // control vectors
2832
2850
  struct llama_control_vector cvec;
2851
+
2852
+ // lora adapters and scales
2853
+ std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
2854
+ };
2855
+
2856
+ struct llama_lora_weight {
2857
+ struct lm_ggml_tensor * a = nullptr;
2858
+ struct lm_ggml_tensor * b = nullptr;
2859
+ llama_lora_weight() = default;
2860
+ llama_lora_weight(struct lm_ggml_tensor * a, struct lm_ggml_tensor * b): a(a), b(b) {}
2861
+ };
2862
+
2863
+ struct llama_lora_adapter {
2864
+ struct llama_model * base_model;
2865
+ // map tensor name to lora_a_b
2866
+ std::unordered_map<std::string, struct llama_lora_weight> ab_map;
2867
+ std::vector<struct lm_ggml_context *> ctxs;
2868
+ std::vector<lm_ggml_backend_buffer_t> bufs;
2869
+
2870
+ float alpha;
2871
+
2872
+ llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
2873
+ base_model->lora_adapters.insert(this);
2874
+ }
2875
+
2876
+ llama_lora_weight * get_weight(struct lm_ggml_tensor * w) {
2877
+ std::string name(w->name);
2878
+ auto pos = ab_map.find(name);
2879
+ if (ab_map.find(name) != ab_map.end()) {
2880
+ return &pos->second;
2881
+ }
2882
+ return nullptr;
2883
+ }
2884
+
2885
+ ~llama_lora_adapter() {
2886
+ for (struct lm_ggml_context * ctx : ctxs) {
2887
+ lm_ggml_free(ctx);
2888
+ }
2889
+ for (lm_ggml_backend_buffer_t buf : bufs) {
2890
+ lm_ggml_backend_buffer_free(buf);
2891
+ }
2892
+ auto pos = base_model->lora_adapters.find(this);
2893
+ if (pos != base_model->lora_adapters.end()) {
2894
+ base_model->lora_adapters.erase(pos);
2895
+ }
2896
+ }
2833
2897
  };
2834
2898
 
2835
2899
  static size_t llama_get_device_count(const llama_model & model) {
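Note: the new llama_lora_adapter ties its lifetime to the base model. The constructor registers the adapter in the model's lora_adapters set, the adapter destructor unregisters it, and ~llama_model() drains the set with a while-loop because each llama_lora_adapter_free erases an element, which would invalidate a range-for iterator. A minimal standalone sketch of that ownership pattern (illustration only, not llama.cpp code):

#include <cstdio>
#include <set>

struct Adapter;

struct Model {
    std::set<Adapter *> adapters;   // adapters currently attached to this model
    ~Model();
};

struct Adapter {
    Model * base;
    explicit Adapter(Model * m) : base(m) { m->adapters.insert(this); }
    ~Adapter() { base->adapters.erase(this); }          // unregister on free
};

Model::~Model() {
    while (!adapters.empty()) {
        delete *adapters.begin();                        // ~Adapter() erases itself from the set
    }
}

int main() {
    Model m;
    new Adapter(&m);                                     // left for the model to clean up
    Adapter * a = new Adapter(&m);
    delete a;                                            // freeing early is fine: it unregisters itself
    std::printf("adapters left: %zu\n", m.adapters.size());
    return 0;                                            // ~Model() frees the remaining adapter
}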
@@ -2840,6 +2904,8 @@ static size_t llama_get_device_count(const llama_model & model) {
2840
2904
  count = lm_ggml_backend_sycl_get_device_count();
2841
2905
  #elif defined(LM_GGML_USE_VULKAN)
2842
2906
  count = lm_ggml_backend_vk_get_device_count();
2907
+ #elif defined(LM_GGML_USE_CANN)
2908
+ return lm_ggml_backend_cann_get_device_count();
2843
2909
  #endif
2844
2910
  #if defined(LM_GGML_USE_RPC)
2845
2911
  count += model.rpc_servers.size();
@@ -2872,6 +2938,8 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
2872
2938
  if (buft == nullptr) {
2873
2939
  LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
2874
2940
  }
2941
+ #elif defined(LM_GGML_USE_CANN)
2942
+ buft = lm_ggml_backend_cann_buffer_type(gpu);
2875
2943
  #endif
2876
2944
 
2877
2945
  if (buft == nullptr) {
@@ -2932,6 +3000,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
2932
3000
  size_t free;
2933
3001
  lm_ggml_backend_vk_get_device_memory(device, &free, &total);
2934
3002
  return free;
3003
+ #elif defined(LM_GGML_USE_CANN)
3004
+ size_t total;
3005
+ size_t free;
3006
+ lm_ggml_backend_cann_get_device_memory(device, &total, &free);
3007
+ return free;
2935
3008
  #else
2936
3009
  return 1;
2937
3010
  #endif
@@ -3645,7 +3718,7 @@ struct llama_model_loader {
3645
3718
  }
3646
3719
 
3647
3720
  if (param_overrides_p != nullptr) {
3648
- for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
3721
+ for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
3649
3722
  kv_overrides.insert({std::string(p->key), *p});
3650
3723
  }
3651
3724
  }
@@ -3813,7 +3886,7 @@ struct llama_model_loader {
3813
3886
  ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
3814
3887
 
3815
3888
  {
3816
- const int kid = lm_gguf_find_key(meta, "general.file_type");
3889
+ const int kid = lm_gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
3817
3890
  if (kid >= 0) {
3818
3891
  ftype = (llama_ftype) lm_gguf_get_val_u32(meta, kid);
3819
3892
  }
@@ -3945,7 +4018,9 @@ struct llama_model_loader {
3945
4018
  throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
3946
4019
  }
3947
4020
 
3948
- LM_GGML_ASSERT(arr_info.length <= N_MAX);
4021
+ if (arr_info.length > N_MAX) {
4022
+ throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
4023
+ }
3949
4024
 
3950
4025
  std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
3951
4026
 
@@ -3981,8 +4056,6 @@ struct llama_model_loader {
3981
4056
  // get array of n <= N_MAX elements, or a single element repeated n times
3982
4057
  template<typename T, size_t N_MAX>
3983
4058
  bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
3984
- LM_GGML_ASSERT(n <= N_MAX);
3985
-
3986
4059
  const int kid = lm_gguf_find_key(meta, key.c_str());
3987
4060
 
3988
4061
  if (kid < 0) {
@@ -3992,6 +4065,10 @@ struct llama_model_loader {
3992
4065
  return false;
3993
4066
  }
3994
4067
 
4068
+ if (n > N_MAX) {
4069
+ throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
4070
+ }
4071
+
3995
4072
  if (lm_gguf_get_kv_type(meta, kid) == LM_GGUF_TYPE_ARRAY) {
3996
4073
  struct GGUFMeta::ArrayInfo arr_info =
3997
4074
  GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
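Note: get_arr() and get_key_or_arr() now throw std::runtime_error instead of tripping LM_GGML_ASSERT when a GGUF array is longer than the destination buffer, so a malformed or oversized metadata entry surfaces as a loader error rather than aborting the process. A minimal sketch of the same check-and-throw pattern with a caller-side catch (standalone illustration, not the llama.cpp API):

#include <algorithm>
#include <array>
#include <cstdio>
#include <stdexcept>

template <typename T, size_t N_MAX>
void copy_checked(const T * src, size_t len, std::array<T, N_MAX> & dst) {
    if (len > N_MAX) {
        // report the bad metadata instead of asserting
        throw std::runtime_error("array length exceeds maximum");
    }
    std::copy(src, src + len, dst.begin());
}

int main() {
    std::array<int, 4> dst{};
    const int src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    try {
        copy_checked(src, 8, dst);                       // too long: reported, not fatal
    } catch (const std::runtime_error & e) {
        std::fprintf(stderr, "failed to load metadata: %s\n", e.what());
    }
    return 0;
}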
@@ -4461,40 +4538,36 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
4461
4538
  }
4462
4539
 
4463
4540
  switch (ftype) {
4464
- case LLAMA_FTYPE_ALL_F32: return "all F32";
4465
- case LLAMA_FTYPE_MOSTLY_F16: return "F16";
4466
- case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
4467
- case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
4468
- case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
4469
- case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
4470
- return "Q4_1, some F16";
4471
- case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
4472
- case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
4473
- case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
4474
-
4475
- // K-quants
4476
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
4477
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
4478
- case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
4479
- case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
4480
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
4481
- case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
4482
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
4483
- case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
4484
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
4485
- case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
4486
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
4487
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
4488
- case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
4489
- case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
4490
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
4491
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
4492
- case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
4493
- case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
4494
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
4495
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
4496
- case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
4497
- case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
4541
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
4542
+ case LLAMA_FTYPE_MOSTLY_F16: return "F16";
4543
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
4544
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
4545
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
4546
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
4547
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
4548
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
4549
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
4550
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
4551
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
4552
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
4553
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
4554
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
4555
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
4556
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
4557
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
4558
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
4559
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
4560
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
4561
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
4562
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
4563
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
4564
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
4565
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
4566
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
4567
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
4568
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
4569
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
4570
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
4498
4571
  case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
4499
4572
  case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
4500
4573
  case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -4945,7 +5018,7 @@ static void llm_load_hparams(
4945
5018
  {
4946
5019
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4947
5020
  switch (hparams.n_layer) {
4948
- case 42: model.type = e_model::MODEL_SMALL; break;
5021
+ case 42: model.type = e_model::MODEL_7B; break;
4949
5022
  default: model.type = e_model::MODEL_UNKNOWN;
4950
5023
  }
4951
5024
  } break;
@@ -5307,6 +5380,7 @@ static void llm_load_vocab(
5307
5380
  if (merges_keyidx == -1) {
5308
5381
  throw std::runtime_error("cannot find tokenizer merges in model file\n");
5309
5382
  }
5383
+
5310
5384
  const int n_merges = lm_gguf_get_arr_n(ctx, merges_keyidx);
5311
5385
  for (int i = 0; i < n_merges; i++) {
5312
5386
  const std::string word = lm_gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5345,16 +5419,6 @@ static void llm_load_vocab(
5345
5419
  vocab.special_cls_id = -1;
5346
5420
  vocab.special_mask_id = -1;
5347
5421
 
5348
- const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
5349
- if (add_space_prefix_keyidx != -1) {
5350
- vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
5351
- } // The default value of add_space_prefix is true.
5352
-
5353
- const int remove_extra_whitespaces_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
5354
- if (remove_extra_whitespaces_keyidx != -1) {
5355
- vocab.tokenizer_remove_extra_whitespaces = lm_gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
5356
- } // The default value of remove_extra_whitespaces is false.
5357
-
5358
5422
  const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
5359
5423
  if (precompiled_charsmap_keyidx != -1) {
5360
5424
  size_t n_precompiled_charsmap = lm_gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5462,6 +5526,19 @@ static void llm_load_vocab(
5462
5526
  } else if (
5463
5527
  tokenizer_pre == "jais") {
5464
5528
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
5529
+ } else if (
5530
+ tokenizer_pre == "tekken") {
5531
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
5532
+ vocab.tokenizer_clean_spaces = false;
5533
+ vocab.tokenizer_ignore_merges = true;
5534
+ vocab.tokenizer_add_bos = true;
5535
+ } else if (
5536
+ tokenizer_pre == "smollm") {
5537
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
5538
+ vocab.tokenizer_clean_spaces = false;
5539
+ } else if (
5540
+ tokenizer_pre == "codeshell") {
5541
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
5465
5542
  } else {
5466
5543
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
5467
5544
  }
@@ -5485,10 +5562,8 @@ static void llm_load_vocab(
5485
5562
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
5486
5563
  }
5487
5564
 
5488
- const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
5489
- if (add_space_prefix_keyidx != -1) {
5490
- vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
5491
- }
5565
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
5566
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
5492
5567
  }
5493
5568
 
5494
5569
  const int token_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6069,10 +6144,10 @@ static bool llm_load_tensors(
6069
6144
 
6070
6145
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6071
6146
 
6072
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
6073
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
6074
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
6075
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
6147
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
6148
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
6149
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
6150
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
6076
6151
 
6077
6152
  // optional bias tensors
6078
6153
  layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -7820,6 +7895,58 @@ static void llm_build_kv_store(
7820
7895
  lm_ggml_build_forward_expand(graph, lm_ggml_cpy(ctx, v_cur, v_cache_view));
7821
7896
  }
7822
7897
 
7898
+ // do mat_mul, while optionally apply lora
7899
+ static struct lm_ggml_tensor * llm_build_lora_mm(
7900
+ struct llama_context & lctx,
7901
+ struct lm_ggml_context * ctx0,
7902
+ struct lm_ggml_tensor * w,
7903
+ struct lm_ggml_tensor * cur) {
7904
+ struct lm_ggml_tensor * res = lm_ggml_mul_mat(ctx0, w, cur);
7905
+ for (auto & it : lctx.lora_adapters) {
7906
+ struct llama_lora_weight * lora = it.first->get_weight(w);
7907
+ if (lora == nullptr) {
7908
+ continue;
7909
+ }
7910
+ const float alpha = it.first->alpha;
7911
+ const float rank = (float) lora->b->ne[0];
7912
+ const float scale = alpha ? it.second * alpha / rank : it.second;
7913
+ struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat(
7914
+ ctx0, lora->b,
7915
+ lm_ggml_mul_mat(ctx0, lora->a, cur)
7916
+ );
7917
+ ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
7918
+ res = lm_ggml_add(ctx0, res, ab_cur);
7919
+ }
7920
+ return res;
7921
+ }
7922
+
7923
+ // do mat_mul_id, while optionally apply lora
7924
+ static struct lm_ggml_tensor * llm_build_lora_mm_id(
7925
+ struct llama_context & lctx,
7926
+ struct lm_ggml_context * ctx0,
7927
+ struct lm_ggml_tensor * w, // struct lm_ggml_tensor * as
7928
+ struct lm_ggml_tensor * cur, // struct lm_ggml_tensor * b
7929
+ struct lm_ggml_tensor * ids) {
7930
+ struct lm_ggml_tensor * res = lm_ggml_mul_mat_id(ctx0, w, cur, ids);
7931
+ for (auto & it : lctx.lora_adapters) {
7932
+ struct llama_lora_weight * lora = it.first->get_weight(w);
7933
+ if (lora == nullptr) {
7934
+ continue;
7935
+ }
7936
+ const float alpha = it.first->alpha;
7937
+ const float rank = (float) lora->b->ne[0];
7938
+ const float scale = alpha ? it.second * alpha / rank : it.second;
7939
+ struct lm_ggml_tensor * ab_cur = lm_ggml_mul_mat_id(
7940
+ ctx0, lora->b,
7941
+ lm_ggml_mul_mat_id(ctx0, lora->a, cur, ids),
7942
+ ids
7943
+ );
7944
+ ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
7945
+ res = lm_ggml_add(ctx0, res, ab_cur);
7946
+ }
7947
+ return res;
7948
+ }
7949
+
7823
7950
  static struct lm_ggml_tensor * llm_build_norm(
7824
7951
  struct lm_ggml_context * ctx,
7825
7952
  struct lm_ggml_tensor * cur,
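Note: llm_build_lora_mm() and llm_build_lora_mm_id() apply each active adapter as y = W*x + scale * B*(A*x), where the rank is read from lora->b->ne[0] and scale = adapter_scale * alpha / rank, falling back to the plain adapter scale when alpha is 0. A minimal standalone sketch of that computation on dense float buffers (illustrative sizes only, not ggml code):

#include <cstdio>
#include <vector>

// y = M * x for a row-major rows x cols matrix
static std::vector<float> matvec(const std::vector<float> & M,
                                 const std::vector<float> & x,
                                 int rows, int cols) {
    std::vector<float> y(rows, 0.0f);
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            y[i] += M[i * cols + j] * x[j];
        }
    }
    return y;
}

int main() {
    const int n_in = 4, n_out = 4, r = 2;              // hypothetical sizes, rank-2 adapter
    std::vector<float> W(n_out * n_in, 0.10f);         // frozen base weight
    std::vector<float> A(r * n_in,     0.01f);         // lora_a : r x n_in
    std::vector<float> B(n_out * r,    0.02f);         // lora_b : n_out x r
    std::vector<float> x(n_in, 1.0f);

    const float alpha = 16.0f, adapter_scale = 1.0f;
    const float scale = alpha != 0.0f ? adapter_scale * alpha / (float) r : adapter_scale;

    std::vector<float> y  = matvec(W, x,  n_out, n_in);  // W * x
    std::vector<float> ax = matvec(A, x,  r,     n_in);  // A * x
    std::vector<float> bx = matvec(B, ax, n_out, r);     // B * (A * x)
    for (int i = 0; i < n_out; ++i) {
        y[i] += scale * bx[i];                            // add the scaled LoRA delta
    }

    for (float v : y) {
        std::printf("%.4f\n", v);                         // 0.4128 per output with these values
    }
    return 0;
}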
@@ -7854,6 +7981,7 @@ static struct lm_ggml_tensor * llm_build_norm(
7854
7981
 
7855
7982
  static struct lm_ggml_tensor * llm_build_ffn(
7856
7983
  struct lm_ggml_context * ctx,
7984
+ struct llama_context & lctx,
7857
7985
  struct lm_ggml_tensor * cur,
7858
7986
  struct lm_ggml_tensor * up,
7859
7987
  struct lm_ggml_tensor * up_b,
@@ -7869,7 +7997,7 @@ static struct lm_ggml_tensor * llm_build_ffn(
7869
7997
  llm_ffn_gate_type type_gate,
7870
7998
  const llm_build_cb & cb,
7871
7999
  int il) {
7872
- struct lm_ggml_tensor * tmp = up ? lm_ggml_mul_mat(ctx, up, cur) : cur;
8000
+ struct lm_ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
7873
8001
  cb(tmp, "ffn_up", il);
7874
8002
 
7875
8003
  if (up_b) {
@@ -7886,12 +8014,12 @@ static struct lm_ggml_tensor * llm_build_ffn(
7886
8014
  switch (type_gate) {
7887
8015
  case LLM_FFN_SEQ:
7888
8016
  {
7889
- cur = lm_ggml_mul_mat(ctx, gate, tmp);
8017
+ cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
7890
8018
  cb(cur, "ffn_gate", il);
7891
8019
  } break;
7892
8020
  case LLM_FFN_PAR:
7893
8021
  {
7894
- cur = lm_ggml_mul_mat(ctx, gate, cur);
8022
+ cur = llm_build_lora_mm(lctx, ctx, gate, cur);
7895
8023
  cb(cur, "ffn_gate", il);
7896
8024
  } break;
7897
8025
  }
@@ -7959,7 +8087,7 @@ static struct lm_ggml_tensor * llm_build_ffn(
7959
8087
  }
7960
8088
 
7961
8089
  if (down) {
7962
- cur = lm_ggml_mul_mat(ctx, down, cur);
8090
+ cur = llm_build_lora_mm(lctx, ctx, down, cur);
7963
8091
  }
7964
8092
 
7965
8093
  if (down_b) {
@@ -7980,6 +8108,7 @@ static struct lm_ggml_tensor * llm_build_ffn(
7980
8108
 
7981
8109
  static struct lm_ggml_tensor * llm_build_moe_ffn(
7982
8110
  struct lm_ggml_context * ctx,
8111
+ struct llama_context & lctx,
7983
8112
  struct lm_ggml_tensor * cur,
7984
8113
  struct lm_ggml_tensor * gate_inp,
7985
8114
  struct lm_ggml_tensor * up_exps,
@@ -7996,7 +8125,7 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
7996
8125
  int64_t n_embd = cur->ne[0];
7997
8126
  int64_t n_tokens = cur->ne[1];
7998
8127
 
7999
- lm_ggml_tensor * logits = lm_ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
8128
+ lm_ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
8000
8129
  cb(logits, "ffn_moe_logits", il);
8001
8130
 
8002
8131
  lm_ggml_tensor * probs = lm_ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
@@ -8028,10 +8157,10 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
8028
8157
  }
8029
8158
 
8030
8159
  cur = lm_ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
8031
- lm_ggml_tensor * up = lm_ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
8160
+ lm_ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
8032
8161
  cb(up, "ffn_moe_up", il);
8033
8162
 
8034
- lm_ggml_tensor * gate = lm_ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
8163
+ lm_ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
8035
8164
  cb(gate, "ffn_moe_gate", il);
8036
8165
 
8037
8166
  switch (type_op) {
@@ -8052,7 +8181,7 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
8052
8181
  lm_ggml_tensor * par = lm_ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
8053
8182
  cb(par, "ffn_moe_gate_par", il);
8054
8183
 
8055
- lm_ggml_tensor * experts = lm_ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
8184
+ lm_ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
8056
8185
  cb(experts, "ffn_moe_down", il);
8057
8186
 
8058
8187
  experts = lm_ggml_mul(ctx, experts, weights);
@@ -8080,9 +8209,7 @@ static struct lm_ggml_tensor * llm_build_moe_ffn(
8080
8209
 
8081
8210
  static struct lm_ggml_tensor * llm_build_kqv(
8082
8211
  struct lm_ggml_context * ctx,
8083
- const llama_model & model,
8084
- const llama_hparams & hparams,
8085
- const llama_cparams & cparams,
8212
+ struct llama_context & lctx,
8086
8213
  const llama_kv_cache & kv,
8087
8214
  struct lm_ggml_cgraph * graph,
8088
8215
  struct lm_ggml_tensor * wo,
@@ -8094,6 +8221,10 @@ static struct lm_ggml_tensor * llm_build_kqv(
8094
8221
  float kq_scale,
8095
8222
  const llm_build_cb & cb,
8096
8223
  int il) {
8224
+ const llama_model & model = lctx.model;
8225
+ const llama_hparams & hparams = lctx.model.hparams;
8226
+ const llama_cparams & cparams = lctx.cparams;
8227
+
8097
8228
  const int64_t n_ctx = cparams.n_ctx;
8098
8229
  const int64_t n_head = hparams.n_head(il);
8099
8230
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -8192,7 +8323,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
8192
8323
  lm_ggml_build_forward_expand(graph, cur);
8193
8324
 
8194
8325
  if (wo) {
8195
- cur = lm_ggml_mul_mat(ctx, wo, cur);
8326
+ cur = llm_build_lora_mm(lctx, ctx, wo, cur);
8196
8327
  }
8197
8328
 
8198
8329
  if (wo_b) {
@@ -8208,9 +8339,7 @@ static struct lm_ggml_tensor * llm_build_kqv(
8208
8339
 
8209
8340
  static struct lm_ggml_tensor * llm_build_kv(
8210
8341
  struct lm_ggml_context * ctx,
8211
- const llama_model & model,
8212
- const llama_hparams & hparams,
8213
- const llama_cparams & cparams,
8342
+ struct llama_context & lctx,
8214
8343
  const llama_kv_cache & kv,
8215
8344
  struct lm_ggml_cgraph * graph,
8216
8345
  struct lm_ggml_tensor * wo,
@@ -8225,6 +8354,8 @@ static struct lm_ggml_tensor * llm_build_kv(
8225
8354
  float kq_scale,
8226
8355
  const llm_build_cb & cb,
8227
8356
  int il) {
8357
+ const llama_hparams & hparams = lctx.model.hparams;
8358
+ const llama_cparams & cparams = lctx.cparams;
8228
8359
 
8229
8360
  // these nodes are added to the graph together so that they are not reordered
8230
8361
  // by doing so, the number of splits in the graph is reduced
@@ -8236,7 +8367,7 @@ static struct lm_ggml_tensor * llm_build_kv(
8236
8367
 
8237
8368
  struct lm_ggml_tensor * cur;
8238
8369
 
8239
- cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
8370
+ cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
8240
8371
  q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
8241
8372
  cb(cur, "kqv_out", il);
8242
8373
 
@@ -8698,21 +8829,21 @@ struct llm_build_context {
8698
8829
  // self-attention
8699
8830
  {
8700
8831
  // compute Q and K and RoPE them
8701
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8832
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
8702
8833
  cb(Qcur, "Qcur", il);
8703
8834
  if (model.layers[il].bq) {
8704
8835
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
8705
8836
  cb(Qcur, "Qcur", il);
8706
8837
  }
8707
8838
 
8708
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8839
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
8709
8840
  cb(Kcur, "Kcur", il);
8710
8841
  if (model.layers[il].bk) {
8711
8842
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
8712
8843
  cb(Kcur, "Kcur", il);
8713
8844
  }
8714
8845
 
8715
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8846
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
8716
8847
  cb(Vcur, "Vcur", il);
8717
8848
  if (model.layers[il].bv) {
8718
8849
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -8733,7 +8864,7 @@ struct llm_build_context {
8733
8864
  );
8734
8865
  cb(Kcur, "Kcur", il);
8735
8866
 
8736
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8867
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
8737
8868
  model.layers[il].wo, model.layers[il].bo,
8738
8869
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8739
8870
  }
@@ -8756,7 +8887,7 @@ struct llm_build_context {
8756
8887
  LLM_NORM_RMS, cb, il);
8757
8888
  cb(cur, "ffn_norm", il);
8758
8889
 
8759
- cur = llm_build_ffn(ctx0, cur,
8890
+ cur = llm_build_ffn(ctx0, lctx, cur,
8760
8891
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
8761
8892
  model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
8762
8893
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -8770,7 +8901,7 @@ struct llm_build_context {
8770
8901
  LLM_NORM_RMS, cb, il);
8771
8902
  cb(cur, "ffn_norm", il);
8772
8903
 
8773
- cur = llm_build_moe_ffn(ctx0, cur,
8904
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
8774
8905
  model.layers[il].ffn_gate_inp,
8775
8906
  model.layers[il].ffn_up_exps,
8776
8907
  model.layers[il].ffn_gate_exps,
@@ -8800,7 +8931,7 @@ struct llm_build_context {
8800
8931
  cb(cur, "result_norm", -1);
8801
8932
 
8802
8933
  // lm_head
8803
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
8934
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
8804
8935
  cb(cur, "result_output", -1);
8805
8936
 
8806
8937
  lm_ggml_build_forward_expand(gf, cur);
@@ -8836,13 +8967,13 @@ struct llm_build_context {
8836
8967
 
8837
8968
  // self-attention
8838
8969
  {
8839
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8970
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
8840
8971
  cb(Qcur, "Qcur", il);
8841
8972
 
8842
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8973
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
8843
8974
  cb(Kcur, "Kcur", il);
8844
8975
 
8845
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8976
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
8846
8977
  cb(Vcur, "Vcur", il);
8847
8978
 
8848
8979
  switch (model.type) {
@@ -8868,7 +8999,7 @@ struct llm_build_context {
8868
8999
  cb(Qcur, "Qcur", il);
8869
9000
  cb(Kcur, "Kcur", il);
8870
9001
 
8871
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9002
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
8872
9003
  model.layers[il].wo, NULL,
8873
9004
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8874
9005
  }
@@ -8890,7 +9021,7 @@ struct llm_build_context {
8890
9021
  LLM_NORM_RMS, cb, il);
8891
9022
  cb(cur, "ffn_norm", il);
8892
9023
 
8893
- cur = llm_build_ffn(ctx0, cur,
9024
+ cur = llm_build_ffn(ctx0, lctx, cur,
8894
9025
  model.layers[il].ffn_up, NULL, NULL,
8895
9026
  model.layers[il].ffn_gate, NULL, NULL,
8896
9027
  model.layers[il].ffn_down, NULL, NULL,
@@ -8915,7 +9046,7 @@ struct llm_build_context {
8915
9046
  cb(cur, "result_norm", -1);
8916
9047
 
8917
9048
  // lm_head
8918
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9049
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
8919
9050
  cb(cur, "result_output", -1);
8920
9051
 
8921
9052
  lm_ggml_build_forward_expand(gf, cur);
@@ -8951,13 +9082,13 @@ struct llm_build_context {
8951
9082
 
8952
9083
  // self-attention
8953
9084
  {
8954
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
9085
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
8955
9086
  cb(Qcur, "Qcur", il);
8956
9087
 
8957
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
9088
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
8958
9089
  cb(Kcur, "Kcur", il);
8959
9090
 
8960
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9091
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
8961
9092
  cb(Vcur, "Vcur", il);
8962
9093
 
8963
9094
  Qcur = lm_ggml_rope_ext(
@@ -8973,7 +9104,7 @@ struct llm_build_context {
8973
9104
  ext_factor, attn_factor, beta_fast, beta_slow
8974
9105
  );
8975
9106
  cb(Kcur, "Kcur", il);
8976
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9107
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
8977
9108
  model.layers[il].wo, NULL,
8978
9109
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8979
9110
  }
@@ -8995,7 +9126,7 @@ struct llm_build_context {
8995
9126
  LLM_NORM_RMS, cb, il);
8996
9127
  cb(cur, "ffn_norm", il);
8997
9128
 
8998
- cur = llm_build_ffn(ctx0, cur,
9129
+ cur = llm_build_ffn(ctx0, lctx, cur,
8999
9130
  model.layers[il].ffn_up, NULL, NULL,
9000
9131
  model.layers[il].ffn_gate, NULL, NULL,
9001
9132
  model.layers[il].ffn_down, NULL, NULL,
@@ -9018,7 +9149,7 @@ struct llm_build_context {
9018
9149
  cb(cur, "result_norm", -1);
9019
9150
 
9020
9151
  // lm_head
9021
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9152
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9022
9153
  cb(cur, "result_output", -1);
9023
9154
 
9024
9155
  lm_ggml_build_forward_expand(gf, cur);
@@ -9067,7 +9198,7 @@ struct llm_build_context {
9067
9198
  cur = attn_norm;
9068
9199
  }
9069
9200
 
9070
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
9201
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
9071
9202
  cb(cur, "wqkv", il);
9072
9203
 
9073
9204
  struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
@@ -9094,7 +9225,7 @@ struct llm_build_context {
9094
9225
  );
9095
9226
  cb(Kcur, "Kcur", il);
9096
9227
 
9097
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9228
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
9098
9229
  model.layers[il].wo, NULL,
9099
9230
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9100
9231
  }
@@ -9111,7 +9242,7 @@ struct llm_build_context {
9111
9242
 
9112
9243
  // feed forward
9113
9244
  {
9114
- cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
9245
+ cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result
9115
9246
  model.layers[il].ffn_up, NULL, NULL,
9116
9247
  NULL, NULL, NULL,
9117
9248
  model.layers[il].ffn_down, NULL, NULL,
@@ -9138,7 +9269,7 @@ struct llm_build_context {
9138
9269
  LLM_NORM, cb, -1);
9139
9270
  cb(cur, "result_norm", -1);
9140
9271
 
9141
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9272
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9142
9273
  cb(cur, "result_output", -1);
9143
9274
 
9144
9275
  lm_ggml_build_forward_expand(gf, cur);
@@ -9183,21 +9314,21 @@ struct llm_build_context {
9183
9314
  // self-attention
9184
9315
  {
9185
9316
  // compute Q and K and RoPE them
9186
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
9317
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
9187
9318
  cb(Qcur, "Qcur", il);
9188
9319
  if (model.layers[il].bq) {
9189
9320
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
9190
9321
  cb(Qcur, "Qcur", il);
9191
9322
  }
9192
9323
 
9193
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
9324
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
9194
9325
  cb(Kcur, "Kcur", il);
9195
9326
  if (model.layers[il].bk) {
9196
9327
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
9197
9328
  cb(Kcur, "Kcur", il);
9198
9329
  }
9199
9330
 
9200
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9331
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
9201
9332
  cb(Vcur, "Vcur", il);
9202
9333
  if (model.layers[il].bv) {
9203
9334
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -9218,7 +9349,7 @@ struct llm_build_context {
9218
9349
  );
9219
9350
  cb(Kcur, "Kcur", il);
9220
9351
 
9221
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9352
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
9222
9353
  model.layers[il].wo, model.layers[il].bo,
9223
9354
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9224
9355
  }
@@ -9250,7 +9381,7 @@ struct llm_build_context {
9250
9381
  LLM_NORM_RMS, cb, il);
9251
9382
  cb(cur, "ffn_norm", il);
9252
9383
 
9253
- cur = llm_build_moe_ffn(ctx0, cur,
9384
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
9254
9385
  model.layers[il].ffn_gate_inp,
9255
9386
  model.layers[il].ffn_up_exps,
9256
9387
  model.layers[il].ffn_gate_exps,
@@ -9289,7 +9420,7 @@ struct llm_build_context {
9289
9420
  cb(cur, "result_norm", -1);
9290
9421
 
9291
9422
  // lm_head
9292
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9423
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9293
9424
 
9294
9425
  // Grok
9295
9426
  // multiply logits by output_multiplier_scale of 0.5773502691896257
@@ -9340,7 +9471,7 @@ struct llm_build_context {
9340
9471
  struct lm_ggml_tensor * Kcur = nullptr;
9341
9472
  struct lm_ggml_tensor * Vcur = nullptr;
9342
9473
 
9343
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
9474
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
9344
9475
  cb(cur, "wqkv", il);
9345
9476
 
9346
9477
  cur = lm_ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -9368,7 +9499,7 @@ struct llm_build_context {
9368
9499
  );
9369
9500
  cb(Kcur, "Kcur", il);
9370
9501
 
9371
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9502
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
9372
9503
  model.layers[il].wo, NULL,
9373
9504
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9374
9505
  }
@@ -9391,7 +9522,7 @@ struct llm_build_context {
9391
9522
  LLM_NORM, cb, il);
9392
9523
  cb(cur, "attn_out_norm", il);
9393
9524
 
9394
- cur = llm_build_moe_ffn(ctx0, cur,
9525
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
9395
9526
  model.layers[il].ffn_gate_inp,
9396
9527
  model.layers[il].ffn_up_exps,
9397
9528
  model.layers[il].ffn_gate_exps,
@@ -9420,7 +9551,7 @@ struct llm_build_context {
9420
9551
  cb(cur, "result_norm", -1);
9421
9552
 
9422
9553
  // lm_head
9423
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9554
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9424
9555
 
9425
9556
  cb(cur, "result_output", -1);
9426
9557
 
@@ -9462,7 +9593,7 @@ struct llm_build_context {
9462
9593
 
9463
9594
  // self-attention
9464
9595
  {
9465
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
9596
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
9466
9597
  cb(cur, "wqkv", il);
9467
9598
 
9468
9599
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -9478,7 +9609,7 @@ struct llm_build_context {
9478
9609
 
9479
9610
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9480
9611
 
9481
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9612
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
9482
9613
  model.layers[il].wo, model.layers[il].bo,
9483
9614
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9484
9615
  }
@@ -9502,7 +9633,7 @@ struct llm_build_context {
9502
9633
  LLM_NORM, cb, il);
9503
9634
  cb(cur, "ffn_norm", il);
9504
9635
 
9505
- cur = llm_build_ffn(ctx0, cur,
9636
+ cur = llm_build_ffn(ctx0, lctx, cur,
9506
9637
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
9507
9638
  NULL, NULL, NULL,
9508
9639
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -9525,7 +9656,7 @@ struct llm_build_context {
9525
9656
  LLM_NORM, cb, -1);
9526
9657
  cb(cur, "result_norm", -1);
9527
9658
 
9528
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9659
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9529
9660
  cb(cur, "result_output", -1);
9530
9661
 
9531
9662
  lm_ggml_build_forward_expand(gf, cur);
@@ -9557,13 +9688,13 @@ struct llm_build_context {
9557
9688
 
9558
9689
  // self-attention
9559
9690
  {
9560
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
9691
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
9561
9692
  cb(Qcur, "Qcur", il);
9562
9693
 
9563
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
9694
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
9564
9695
  cb(Kcur, "Kcur", il);
9565
9696
 
9566
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9697
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
9567
9698
  cb(Vcur, "Vcur", il);
9568
9699
 
9569
9700
  Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@@ -9572,7 +9703,7 @@ struct llm_build_context {
9572
9703
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9573
9704
  cb(Qcur, "Qcur", il);
9574
9705
 
9575
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9706
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
9576
9707
  model.layers[il].wo, NULL,
9577
9708
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9578
9709
  }
@@ -9594,7 +9725,7 @@ struct llm_build_context {
9594
9725
  LLM_NORM_RMS, cb, il);
9595
9726
  cb(cur, "ffn_norm", il);
9596
9727
 
9597
- cur = llm_build_ffn(ctx0, cur,
9728
+ cur = llm_build_ffn(ctx0, lctx, cur,
9598
9729
  model.layers[il].ffn_up, NULL, NULL,
9599
9730
  model.layers[il].ffn_gate, NULL, NULL,
9600
9731
  model.layers[il].ffn_down, NULL, NULL,
@@ -9619,7 +9750,7 @@ struct llm_build_context {
9619
9750
  cb(cur, "result_norm", -1);
9620
9751
 
9621
9752
  // lm_head
9622
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
9753
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9623
9754
  cb(cur, "result_output", -1);
9624
9755
 
9625
9756
  lm_ggml_build_forward_expand(gf, cur);
@@ -9671,7 +9802,7 @@ struct llm_build_context {
9671
9802
 
9672
9803
  // self-attention
9673
9804
  if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
9674
- Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
9805
+ Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
9675
9806
  cb(Qcur, "Qcur", il);
9676
9807
 
9677
9808
  if (model.layers[il].attn_q_norm) {
@@ -9681,7 +9812,7 @@ struct llm_build_context {
9681
9812
  LLM_NORM, cb, il);
9682
9813
  }
9683
9814
 
9684
- Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
9815
+ Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk);
9685
9816
  cb(Kcur, "Kcur", il);
9686
9817
 
9687
9818
  if (model.layers[il].attn_k_norm) {
@@ -9690,14 +9821,14 @@ struct llm_build_context {
9690
9821
  model.layers[il].attn_k_norm_b,
9691
9822
  LLM_NORM, cb, il);
9692
9823
  }
9693
- Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
9824
+ Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv);
9694
9825
  cb(Vcur, "Vcur", il);
9695
9826
 
9696
9827
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9697
9828
  Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9698
9829
  } else {
9699
9830
  // compute Q and K and RoPE them
9700
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
9831
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
9701
9832
  cb(cur, "wqkv", il);
9702
9833
 
9703
9834
  Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
@@ -9746,7 +9877,7 @@ struct llm_build_context {
9746
9877
 
9747
9878
  lm_ggml_build_forward_expand(gf, cur);
9748
9879
 
9749
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur);
9880
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
9750
9881
  if (model.layers[il].bo) {
9751
9882
  cb(cur, "kqv_wo", il);
9752
9883
  }
@@ -9779,21 +9910,21 @@ struct llm_build_context {
9779
9910
 
9780
9911
  // feed-forward network
9781
9912
  if (model.arch == LLM_ARCH_BERT) {
9782
- cur = llm_build_ffn(ctx0, cur,
9913
+ cur = llm_build_ffn(ctx0, lctx, cur,
9783
9914
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
9784
9915
  NULL, NULL, NULL,
9785
9916
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
9786
9917
  NULL,
9787
9918
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
9788
9919
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
9789
- cur = llm_build_ffn(ctx0, cur,
9920
+ cur = llm_build_ffn(ctx0, lctx, cur,
9790
9921
  model.layers[il].ffn_up, NULL, NULL,
9791
9922
  model.layers[il].ffn_gate, NULL, NULL,
9792
9923
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
9793
9924
  NULL,
9794
9925
  LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
9795
9926
  } else {
9796
- cur = llm_build_ffn(ctx0, cur,
9927
+ cur = llm_build_ffn(ctx0, lctx, cur,
9797
9928
  model.layers[il].ffn_up, NULL, NULL,
9798
9929
  model.layers[il].ffn_gate, NULL, NULL,
9799
9930
  model.layers[il].ffn_down, NULL, NULL,
@@ -9851,7 +9982,7 @@ struct llm_build_context {
9851
9982
 
9852
9983
  // self-attention
9853
9984
  {
9854
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
9985
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
9855
9986
  cb(cur, "wqkv", il);
9856
9987
 
9857
9988
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -9867,7 +9998,7 @@ struct llm_build_context {
9867
9998
 
9868
9999
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9869
10000
 
9870
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10001
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
9871
10002
  model.layers[il].wo, model.layers[il].bo,
9872
10003
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9873
10004
  }
@@ -9891,7 +10022,7 @@ struct llm_build_context {
9891
10022
  LLM_NORM, cb, il);
9892
10023
  cb(cur, "ffn_norm", il);
9893
10024
 
9894
- cur = llm_build_ffn(ctx0, cur,
10025
+ cur = llm_build_ffn(ctx0, lctx, cur,
9895
10026
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
9896
10027
  NULL, NULL, NULL,
9897
10028
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -9914,7 +10045,7 @@ struct llm_build_context {
9914
10045
  LLM_NORM, cb, -1);
9915
10046
  cb(cur, "result_norm", -1);
9916
10047
 
9917
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10048
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
9918
10049
  cb(cur, "result_output", -1);
9919
10050
 
9920
10051
  lm_ggml_build_forward_expand(gf, cur);
@@ -9961,7 +10092,7 @@ struct llm_build_context {
9961
10092
  {
9962
10093
  cur = attn_norm;
9963
10094
 
9964
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10095
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
9965
10096
  cb(cur, "wqkv", il);
9966
10097
 
9967
10098
  if (model.layers[il].bqkv){
@@ -9999,13 +10130,13 @@ struct llm_build_context {
9999
10130
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10000
10131
  Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10001
10132
 
10002
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10133
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10003
10134
  model.layers[il].wo, model.layers[il].bo,
10004
10135
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10005
10136
  } else {
10006
10137
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10007
10138
 
10008
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10139
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10009
10140
  model.layers[il].wo, model.layers[il].bo,
10010
10141
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10011
10142
  }
@@ -10029,7 +10160,7 @@ struct llm_build_context {
10029
10160
  model.layers[il].ffn_norm_b,
10030
10161
  LLM_NORM, cb, il);
10031
10162
  cb(cur, "ffn_norm", il);
10032
- cur = llm_build_ffn(ctx0, cur,
10163
+ cur = llm_build_ffn(ctx0, lctx, cur,
10033
10164
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
10034
10165
  NULL, NULL, NULL,
10035
10166
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -10054,7 +10185,7 @@ struct llm_build_context {
10054
10185
  LLM_NORM, cb, -1);
10055
10186
  cb(cur, "result_norm", -1);
10056
10187
 
10057
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10188
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10058
10189
  cb(cur, "result_output", -1);
10059
10190
 
10060
10191
  lm_ggml_build_forward_expand(gf, cur);
@@ -10094,21 +10225,21 @@ struct llm_build_context {
10094
10225
  // self-attention
10095
10226
  {
10096
10227
  // compute Q and K and RoPE them
10097
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10228
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
10098
10229
  cb(Qcur, "Qcur", il);
10099
10230
  if (model.layers[il].bq) {
10100
10231
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
10101
10232
  cb(Qcur, "Qcur", il);
10102
10233
  }
10103
10234
 
10104
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10235
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
10105
10236
  cb(Kcur, "Kcur", il);
10106
10237
  if (model.layers[il].bk) {
10107
10238
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
10108
10239
  cb(Kcur, "Kcur", il);
10109
10240
  }
10110
10241
 
10111
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10242
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
10112
10243
  cb(Vcur, "Vcur", il);
10113
10244
  if (model.layers[il].bv) {
10114
10245
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -10150,7 +10281,7 @@ struct llm_build_context {
10150
10281
  );
10151
10282
  cb(Kcur, "Kcur", il);
10152
10283
 
10153
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10284
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10154
10285
  model.layers[il].wo, NULL,
10155
10286
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10156
10287
  }
@@ -10178,7 +10309,7 @@ struct llm_build_context {
10178
10309
  // parallel residual
10179
10310
  cur = inpSA;
10180
10311
  }
10181
- cur = llm_build_ffn(ctx0, cur,
10312
+ cur = llm_build_ffn(ctx0, lctx, cur,
10182
10313
  model.layers[il].ffn_up, NULL, NULL,
10183
10314
  model.layers[il].ffn_gate, NULL, NULL,
10184
10315
  model.layers[il].ffn_down, NULL, NULL,
@@ -10204,7 +10335,7 @@ struct llm_build_context {
10204
10335
  cb(cur, "result_norm", -1);
10205
10336
 
10206
10337
  // lm_head
10207
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10338
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10208
10339
  cb(cur, "result_output", -1);
10209
10340
 
10210
10341
  lm_ggml_build_forward_expand(gf, cur);
@@ -10239,7 +10370,7 @@ struct llm_build_context {
10239
10370
 
10240
10371
  // self-attention
10241
10372
  {
10242
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
10373
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
10243
10374
  cb(cur, "wqkv", il);
10244
10375
 
10245
10376
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10269,7 +10400,7 @@ struct llm_build_context {
10269
10400
  );
10270
10401
  cb(Kcur, "Kcur", il);
10271
10402
 
10272
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10403
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10273
10404
  model.layers[il].wo, NULL,
10274
10405
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10275
10406
  }
@@ -10291,7 +10422,7 @@ struct llm_build_context {
10291
10422
  LLM_NORM_RMS, cb, il);
10292
10423
  cb(cur, "ffn_norm", il);
10293
10424
 
10294
- cur = llm_build_ffn(ctx0, cur,
10425
+ cur = llm_build_ffn(ctx0, lctx, cur,
10295
10426
  model.layers[il].ffn_up, NULL, NULL,
10296
10427
  model.layers[il].ffn_gate, NULL, NULL,
10297
10428
  model.layers[il].ffn_down, NULL, NULL,
@@ -10316,7 +10447,7 @@ struct llm_build_context {
10316
10447
  cb(cur, "result_norm", -1);
10317
10448
 
10318
10449
  // lm_head
10319
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10450
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10320
10451
  cb(cur, "result_output", -1);
10321
10452
 
10322
10453
  lm_ggml_build_forward_expand(gf, cur);
@@ -10354,17 +10485,17 @@ struct llm_build_context {
10354
10485
  // self-attention
10355
10486
  {
10356
10487
  // compute Q and K and RoPE them
10357
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10488
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
10358
10489
  cb(Qcur, "Qcur", il);
10359
10490
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
10360
10491
  cb(Qcur, "Qcur", il);
10361
10492
 
10362
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10493
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
10363
10494
  cb(Kcur, "Kcur", il);
10364
10495
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
10365
10496
  cb(Kcur, "Kcur", il);
10366
10497
 
10367
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10498
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
10368
10499
  cb(Vcur, "Vcur", il);
10369
10500
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
10370
10501
  cb(Vcur, "Vcur", il);
@@ -10383,7 +10514,7 @@ struct llm_build_context {
10383
10514
  );
10384
10515
  cb(Kcur, "Kcur", il);
10385
10516
 
10386
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10517
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10387
10518
  model.layers[il].wo, model.layers[il].bo,
10388
10519
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10389
10520
  }
@@ -10404,7 +10535,7 @@ struct llm_build_context {
10404
10535
  LLM_NORM_RMS, cb, il);
10405
10536
  cb(cur, "ffn_norm", il);
10406
10537
 
10407
- cur = llm_build_ffn(ctx0, cur,
10538
+ cur = llm_build_ffn(ctx0, lctx, cur,
10408
10539
  model.layers[il].ffn_up, NULL, NULL,
10409
10540
  model.layers[il].ffn_gate, NULL, NULL,
10410
10541
  model.layers[il].ffn_down, NULL, NULL,
@@ -10428,7 +10559,7 @@ struct llm_build_context {
10428
10559
  cb(cur, "result_norm", -1);
10429
10560
 
10430
10561
  // lm_head
10431
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10562
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10432
10563
  cb(cur, "result_output", -1);
10433
10564
 
10434
10565
  lm_ggml_build_forward_expand(gf, cur);
@@ -10469,17 +10600,17 @@ struct llm_build_context {
10469
10600
  // self_attention
10470
10601
  {
10471
10602
  // compute Q and K and RoPE them
10472
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10603
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
10473
10604
  cb(Qcur, "Qcur", il);
10474
10605
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
10475
10606
  cb(Qcur, "Qcur", il);
10476
10607
 
10477
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10608
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
10478
10609
  cb(Kcur, "Kcur", il);
10479
10610
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
10480
10611
  cb(Kcur, "Kcur", il);
10481
10612
 
10482
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10613
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
10483
10614
  cb(Vcur, "Vcur", il);
10484
10615
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
10485
10616
  cb(Vcur, "Vcur", il);
@@ -10498,7 +10629,7 @@ struct llm_build_context {
10498
10629
  );
10499
10630
  cb(Kcur, "Kcur", il);
10500
10631
 
10501
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10632
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10502
10633
  model.layers[il].wo, model.layers[il].bo,
10503
10634
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10504
10635
  }
@@ -10521,7 +10652,7 @@ struct llm_build_context {
10521
10652
  cb(cur, "ffn_norm", il);
10522
10653
 
10523
10654
  lm_ggml_tensor * moe_out =
10524
- llm_build_moe_ffn(ctx0, cur,
10655
+ llm_build_moe_ffn(ctx0, lctx, cur,
10525
10656
  model.layers[il].ffn_gate_inp,
10526
10657
  model.layers[il].ffn_up_exps,
10527
10658
  model.layers[il].ffn_gate_exps,
@@ -10534,14 +10665,14 @@ struct llm_build_context {
10534
10665
 
10535
10666
  // FFN shared expert
10536
10667
  {
10537
- lm_ggml_tensor * cur_gate_inp = lm_ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
10668
+ lm_ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
10538
10669
  cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
10539
10670
 
10540
10671
  // sigmoid
10541
10672
  lm_ggml_tensor * cur_gate = lm_ggml_div(ctx0, lm_ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
10542
10673
  cb(cur_gate, "ffn_shexp_gate", il);
10543
10674
 
10544
- lm_ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
10675
+ lm_ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur,
10545
10676
  model.layers[il].ffn_up_shexp, NULL, NULL,
10546
10677
  model.layers[il].ffn_gate_shexp, NULL, NULL,
10547
10678
  model.layers[il].ffn_down_shexp, NULL, NULL,
@@ -10574,7 +10705,7 @@ struct llm_build_context {
10574
10705
  cb(cur, "result_norm", -1);
10575
10706
 
10576
10707
  // lm_head
10577
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10708
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10578
10709
  cb(cur, "result_output", -1);
10579
10710
 
10580
10711
  lm_ggml_build_forward_expand(gf, cur);
@@ -10616,7 +10747,7 @@ struct llm_build_context {
10616
10747
  struct lm_ggml_tensor * Vcur = nullptr;
10617
10748
 
10618
10749
  if (model.layers[il].wqkv) {
10619
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
10750
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
10620
10751
  cb(cur, "wqkv", il);
10621
10752
 
10622
10753
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10626,9 +10757,9 @@ struct llm_build_context {
10626
10757
  Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
10627
10758
  Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10628
10759
  } else {
10629
- Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
10630
- Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
10631
- Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
10760
+ Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
10761
+ Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
10762
+ Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
10632
10763
  }
10633
10764
 
10634
10765
  cb(Qcur, "Qcur", il);
@@ -10655,7 +10786,7 @@ struct llm_build_context {
10655
10786
  );
10656
10787
  cb(Kcur, "Kcur", il);
10657
10788
 
10658
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10789
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10659
10790
  model.layers[il].wo, model.layers[il].bo,
10660
10791
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10661
10792
  }
@@ -10670,7 +10801,7 @@ struct llm_build_context {
10670
10801
 
10671
10802
  // FF
10672
10803
  {
10673
- ffn_output = llm_build_ffn(ctx0, attn_norm_output,
10804
+ ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output,
10674
10805
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
10675
10806
  NULL, NULL, NULL,
10676
10807
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -10694,7 +10825,7 @@ struct llm_build_context {
10694
10825
  LLM_NORM, cb, -1);
10695
10826
  cb(cur, "result_norm", -1);
10696
10827
 
10697
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10828
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10698
10829
  cb(cur, "result_output_no_bias", -1);
10699
10830
 
10700
10831
  cur = lm_ggml_add(ctx0, cur, model.output_b);
@@ -10740,7 +10871,7 @@ struct llm_build_context {
10740
10871
  struct lm_ggml_tensor * Vcur = nullptr;
10741
10872
 
10742
10873
  if (model.layers[il].wqkv) {
10743
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
10874
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
10744
10875
  cb(cur, "wqkv", il);
10745
10876
 
10746
10877
  Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
@@ -10748,9 +10879,9 @@ struct llm_build_context {
10748
10879
  Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
10749
10880
  }
10750
10881
  else {
10751
- Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
10752
- Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
10753
- Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
10882
+ Qcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
10883
+ Kcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
10884
+ Vcur = lm_ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
10754
10885
  }
10755
10886
 
10756
10887
  cb(Qcur, "Qcur", il);
@@ -10775,7 +10906,7 @@ struct llm_build_context {
10775
10906
  );
10776
10907
  cb(Kcur, "Kcur", il);
10777
10908
 
10778
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10909
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10779
10910
  model.layers[il].wo, model.layers[il].bo,
10780
10911
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10781
10912
  }
@@ -10799,7 +10930,7 @@ struct llm_build_context {
10799
10930
  // special-case: the up and gate tensors are merged into a single tensor
10800
10931
  // TODO: support into llm_build_ffn
10801
10932
  {
10802
- cur = llm_build_ffn(ctx0, cur,
10933
+ cur = llm_build_ffn(ctx0, lctx, cur,
10803
10934
  model.layers[il].ffn_up, NULL, NULL,
10804
10935
  NULL, NULL, NULL,
10805
10936
  model.layers[il].ffn_down, NULL, NULL,
@@ -10822,7 +10953,7 @@ struct llm_build_context {
10822
10953
  LLM_NORM_RMS, cb, -1);
10823
10954
  cb(cur, "result_norm", -1);
10824
10955
 
10825
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
10956
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10826
10957
  cb(cur, "result_output", -1);
10827
10958
 
10828
10959
  lm_ggml_build_forward_expand(gf, cur);
@@ -10862,13 +10993,13 @@ struct llm_build_context {
10862
10993
  // self-attention
10863
10994
  {
10864
10995
  // compute Q and K and RoPE them
10865
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10996
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
10866
10997
  cb(Qcur, "Qcur", il);
10867
10998
 
10868
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10999
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
10869
11000
  cb(Kcur, "Kcur", il);
10870
11001
 
10871
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11002
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
10872
11003
  cb(Vcur, "Vcur", il);
10873
11004
 
10874
11005
  Qcur = lm_ggml_rope_ext(
@@ -10883,7 +11014,7 @@ struct llm_build_context {
10883
11014
  ext_factor, attn_factor, beta_fast, beta_slow);
10884
11015
  cb(Kcur, "Kcur", il);
10885
11016
 
10886
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11017
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10887
11018
  model.layers[il].wo, NULL,
10888
11019
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10889
11020
  }
@@ -10901,7 +11032,7 @@ struct llm_build_context {
10901
11032
 
10902
11033
  // feed-forward network
10903
11034
  {
10904
- cur = llm_build_ffn(ctx0, cur,
11035
+ cur = llm_build_ffn(ctx0, lctx, cur,
10905
11036
  model.layers[il].ffn_up, NULL, NULL,
10906
11037
  model.layers[il].ffn_gate, NULL, NULL,
10907
11038
  model.layers[il].ffn_down, NULL, NULL,
@@ -10927,7 +11058,7 @@ struct llm_build_context {
10927
11058
  cb(cur, "result_norm", -1);
10928
11059
 
10929
11060
  // lm_head
10930
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11061
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
10931
11062
  cb(cur, "result_output", -1);
10932
11063
 
10933
11064
  lm_ggml_build_forward_expand(gf, cur);
@@ -10969,7 +11100,7 @@ struct llm_build_context {
10969
11100
 
10970
11101
  // self-attention
10971
11102
  {
10972
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
11103
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
10973
11104
  cb(cur, "wqkv", il);
10974
11105
 
10975
11106
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -10985,7 +11116,7 @@ struct llm_build_context {
10985
11116
 
10986
11117
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10987
11118
 
10988
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11119
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
10989
11120
  model.layers[il].wo, model.layers[il].bo,
10990
11121
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10991
11122
  }
@@ -11009,7 +11140,7 @@ struct llm_build_context {
11009
11140
  LLM_NORM, cb, il);
11010
11141
  cb(cur, "ffn_norm", il);
11011
11142
 
11012
- cur = llm_build_ffn(ctx0, cur,
11143
+ cur = llm_build_ffn(ctx0, lctx, cur,
11013
11144
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
11014
11145
  NULL, NULL, NULL,
11015
11146
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -11032,7 +11163,7 @@ struct llm_build_context {
11032
11163
  LLM_NORM, cb, -1);
11033
11164
  cb(cur, "result_norm", -1);
11034
11165
 
11035
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11166
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11036
11167
  cb(cur, "result_output", -1);
11037
11168
 
11038
11169
  lm_ggml_build_forward_expand(gf, cur);
@@ -11068,7 +11199,7 @@ struct llm_build_context {
11068
11199
 
11069
11200
  // self-attention
11070
11201
  {
11071
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
11202
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
11072
11203
  cb(cur, "wqkv", il);
11073
11204
 
11074
11205
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -11096,7 +11227,7 @@ struct llm_build_context {
11096
11227
  );
11097
11228
  cb(Kcur, "Kcur", il);
11098
11229
 
11099
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11230
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11100
11231
  model.layers[il].wo, model.layers[il].bo,
11101
11232
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
11102
11233
  }
@@ -11120,7 +11251,7 @@ struct llm_build_context {
11120
11251
  LLM_NORM, cb, il);
11121
11252
  cb(cur, "ffn_norm", il);
11122
11253
 
11123
- cur = llm_build_ffn(ctx0, cur,
11254
+ cur = llm_build_ffn(ctx0, lctx, cur,
11124
11255
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
11125
11256
  NULL, NULL, NULL,
11126
11257
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -11143,7 +11274,7 @@ struct llm_build_context {
11143
11274
  LLM_NORM, cb, -1);
11144
11275
  cb(cur, "result_norm", -1);
11145
11276
 
11146
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11277
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11147
11278
  cb(cur, "result_output", -1);
11148
11279
 
11149
11280
  lm_ggml_build_forward_expand(gf, cur);
@@ -11181,21 +11312,21 @@ struct llm_build_context {
11181
11312
  // self-attention
11182
11313
  {
11183
11314
  // compute Q and K and RoPE them
11184
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11315
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11185
11316
  cb(Qcur, "Qcur", il);
11186
11317
  // if (model.layers[il].bq) {
11187
11318
  // Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
11188
11319
  // cb(Qcur, "Qcur", il);
11189
11320
  // }
11190
11321
 
11191
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
11322
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11192
11323
  cb(Kcur, "Kcur", il);
11193
11324
  // if (model.layers[il].bk) {
11194
11325
  // Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
11195
11326
  // cb(Kcur, "Kcur", il);
11196
11327
  // }
11197
11328
 
11198
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11329
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11199
11330
  cb(Vcur, "Vcur", il);
11200
11331
  // if (model.layers[il].bv) {
11201
11332
  // Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11216,7 +11347,7 @@ struct llm_build_context {
11216
11347
  );
11217
11348
  cb(Kcur, "Kcur", il);
11218
11349
 
11219
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11350
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11220
11351
  model.layers[il].wo, NULL,
11221
11352
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
11222
11353
  }
@@ -11237,7 +11368,7 @@ struct llm_build_context {
11237
11368
  LLM_NORM, cb, il);
11238
11369
  cb(cur, "ffn_norm", il);
11239
11370
 
11240
- cur = llm_build_ffn(ctx0, cur,
11371
+ cur = llm_build_ffn(ctx0, lctx, cur,
11241
11372
  model.layers[il].ffn_up, NULL, NULL,
11242
11373
  model.layers[il].ffn_gate, NULL, NULL,
11243
11374
  model.layers[il].ffn_down, NULL, NULL,
@@ -11261,7 +11392,7 @@ struct llm_build_context {
11261
11392
  cb(cur, "result_norm", -1);
11262
11393
 
11263
11394
  // lm_head
11264
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11395
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11265
11396
  cb(cur, "result_output", -1);
11266
11397
 
11267
11398
  lm_ggml_build_forward_expand(gf, cur);
@@ -11299,21 +11430,21 @@ struct llm_build_context {
11299
11430
  // self-attention
11300
11431
  {
11301
11432
  // compute Q and K and RoPE them
11302
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11433
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11303
11434
  cb(Qcur, "Qcur", il);
11304
11435
  if (model.layers[il].bq) {
11305
11436
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
11306
11437
  cb(Qcur, "Qcur", il);
11307
11438
  }
11308
11439
 
11309
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
11440
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11310
11441
  cb(Kcur, "Kcur", il);
11311
11442
  if (model.layers[il].bk) {
11312
11443
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
11313
11444
  cb(Kcur, "Kcur", il);
11314
11445
  }
11315
11446
 
11316
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11447
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11317
11448
  cb(Vcur, "Vcur", il);
11318
11449
  if (model.layers[il].bv) {
11319
11450
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11334,7 +11465,7 @@ struct llm_build_context {
11334
11465
  );
11335
11466
  cb(Kcur, "Kcur", il);
11336
11467
 
11337
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11468
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11338
11469
  model.layers[il].wo, model.layers[il].bo,
11339
11470
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
11340
11471
  }
@@ -11355,7 +11486,7 @@ struct llm_build_context {
11355
11486
  LLM_NORM_RMS, cb, il);
11356
11487
  cb(cur, "ffn_norm", il);
11357
11488
 
11358
- cur = llm_build_ffn(ctx0, cur,
11489
+ cur = llm_build_ffn(ctx0, lctx, cur,
11359
11490
  model.layers[il].ffn_up, NULL, NULL,
11360
11491
  model.layers[il].ffn_gate, NULL, NULL,
11361
11492
  model.layers[il].ffn_down, NULL, NULL,
@@ -11379,7 +11510,7 @@ struct llm_build_context {
11379
11510
  cb(cur, "result_norm", -1);
11380
11511
 
11381
11512
  // lm_head
11382
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11513
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11383
11514
  cb(cur, "result_output", -1);
11384
11515
 
11385
11516
  lm_ggml_build_forward_expand(gf, cur);
@@ -11430,21 +11561,21 @@ struct llm_build_context {
11430
11561
  // self-attention
11431
11562
  {
11432
11563
  // compute Q and K and RoPE them
11433
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11564
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11434
11565
  cb(Qcur, "Qcur", il);
11435
11566
  if (model.layers[il].bq) {
11436
11567
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
11437
11568
  cb(Qcur, "Qcur", il);
11438
11569
  }
11439
11570
 
11440
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
11571
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11441
11572
  cb(Kcur, "Kcur", il);
11442
11573
  if (model.layers[il].bk) {
11443
11574
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
11444
11575
  cb(Kcur, "Kcur", il);
11445
11576
  }
11446
11577
 
11447
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11578
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11448
11579
  cb(Vcur, "Vcur", il);
11449
11580
  if (model.layers[il].bv) {
11450
11581
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11465,7 +11596,7 @@ struct llm_build_context {
11465
11596
  );
11466
11597
  cb(Kcur, "Kcur", il);
11467
11598
 
11468
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11599
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11469
11600
  model.layers[il].wo, model.layers[il].bo,
11470
11601
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
11471
11602
  }
@@ -11492,7 +11623,7 @@ struct llm_build_context {
11492
11623
  LLM_NORM_RMS, cb, il);
11493
11624
  cb(cur, "ffn_norm", il);
11494
11625
 
11495
- cur = llm_build_ffn(ctx0, cur,
11626
+ cur = llm_build_ffn(ctx0, lctx, cur,
11496
11627
  model.layers[il].ffn_up, NULL, NULL,
11497
11628
  model.layers[il].ffn_gate, NULL, NULL,
11498
11629
  model.layers[il].ffn_down, NULL, NULL,
@@ -11526,7 +11657,7 @@ struct llm_build_context {
11526
11657
  cb(cur, "lmhead_scaling", -1);
11527
11658
 
11528
11659
  // lm_head
11529
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11660
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11530
11661
  cb(cur, "result_output", -1);
11531
11662
 
11532
11663
  lm_ggml_build_forward_expand(gf, cur);
@@ -11563,13 +11694,13 @@ struct llm_build_context {
11563
11694
  // self-attention
11564
11695
  {
11565
11696
  // compute Q and K and RoPE them
11566
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11697
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11567
11698
  cb(Qcur, "Qcur", il);
11568
11699
 
11569
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
11700
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11570
11701
  cb(Kcur, "Kcur", il);
11571
11702
 
11572
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11703
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11573
11704
  cb(Vcur, "Vcur", il);
11574
11705
 
11575
11706
  Qcur = lm_ggml_rope_ext(
@@ -11587,7 +11718,7 @@ struct llm_build_context {
11587
11718
  ext_factor, attn_factor, beta_fast, beta_slow);
11588
11719
  cb(Kcur, "Kcur", il);
11589
11720
 
11590
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11721
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11591
11722
  model.layers[il].wo, NULL,
11592
11723
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
11593
11724
  }
@@ -11609,7 +11740,7 @@ struct llm_build_context {
11609
11740
 
11610
11741
  // feed-forward network
11611
11742
  {
11612
- cur = llm_build_ffn(ctx0, cur,
11743
+ cur = llm_build_ffn(ctx0, lctx, cur,
11613
11744
  model.layers[il].ffn_up, NULL, NULL,
11614
11745
  model.layers[il].ffn_gate, NULL, NULL,
11615
11746
  model.layers[il].ffn_down, NULL, NULL,
@@ -11634,7 +11765,7 @@ struct llm_build_context {
11634
11765
  cb(cur, "result_norm", -1);
11635
11766
 
11636
11767
  // lm_head
11637
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11768
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11638
11769
  cb(cur, "result_output", -1);
11639
11770
 
11640
11771
  lm_ggml_build_forward_expand(gf, cur);
@@ -11676,13 +11807,13 @@ struct llm_build_context {
11676
11807
  // self-attention
11677
11808
  {
11678
11809
  // compute Q and K and RoPE them
11679
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11810
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11680
11811
  cb(Qcur, "Qcur", il);
11681
11812
 
11682
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
11813
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11683
11814
  cb(Kcur, "Kcur", il);
11684
11815
 
11685
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11816
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11686
11817
  cb(Vcur, "Vcur", il);
11687
11818
 
11688
11819
  Qcur = lm_ggml_rope_ext(
@@ -11705,7 +11836,7 @@ struct llm_build_context {
11705
11836
  ext_factor, attn_factor, beta_fast, beta_slow);
11706
11837
  cb(Kcur, "Kcur", il);
11707
11838
 
11708
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11839
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11709
11840
  model.layers[il].wo, NULL,
11710
11841
  Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
11711
11842
  }
@@ -11732,7 +11863,7 @@ struct llm_build_context {
11732
11863
 
11733
11864
  // feed-forward network
11734
11865
  {
11735
- cur = llm_build_ffn(ctx0, cur,
11866
+ cur = llm_build_ffn(ctx0, lctx, cur,
11736
11867
  model.layers[il].ffn_up, NULL, NULL,
11737
11868
  model.layers[il].ffn_gate, NULL, NULL,
11738
11869
  model.layers[il].ffn_down, NULL, NULL,
@@ -11762,7 +11893,7 @@ struct llm_build_context {
11762
11893
  cb(cur, "result_norm", -1);
11763
11894
 
11764
11895
  // lm_head
11765
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
11896
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11766
11897
 
11767
11898
  // final logit soft-capping
11768
11899
  cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
@@ -11807,21 +11938,21 @@ struct llm_build_context {
11807
11938
  // self-attention
11808
11939
  {
11809
11940
  // compute Q and K and RoPE them
11810
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11941
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11811
11942
  cb(Qcur, "Qcur", il);
11812
11943
  if (model.layers[il].bq) {
11813
11944
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
11814
11945
  cb(Qcur, "Qcur", il);
11815
11946
  }
11816
11947
 
11817
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
11948
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11818
11949
  cb(Kcur, "Kcur", il);
11819
11950
  if (model.layers[il].bk) {
11820
11951
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
11821
11952
  cb(Kcur, "Kcur", il);
11822
11953
  }
11823
11954
 
11824
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
11955
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11825
11956
  cb(Vcur, "Vcur", il);
11826
11957
  if (model.layers[il].bv) {
11827
11958
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -11842,7 +11973,7 @@ struct llm_build_context {
11842
11973
  );
11843
11974
  cb(Kcur, "Kcur", il);
11844
11975
 
11845
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11976
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11846
11977
  model.layers[il].wo, model.layers[il].bo,
11847
11978
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
11848
11979
  }
@@ -11864,7 +11995,7 @@ struct llm_build_context {
11864
11995
  LLM_NORM, cb, il);
11865
11996
  cb(cur, "ffn_norm", il);
11866
11997
 
11867
- cur = llm_build_ffn(ctx0, cur,
11998
+ cur = llm_build_ffn(ctx0, lctx, cur,
11868
11999
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
11869
12000
  NULL, NULL, NULL,
11870
12001
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -11888,7 +12019,7 @@ struct llm_build_context {
11888
12019
  cb(cur, "result_norm", -1);
11889
12020
 
11890
12021
  // lm_head
11891
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12022
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11892
12023
  cb(cur, "result_output", -1);
11893
12024
 
11894
12025
  lm_ggml_build_forward_expand(gf, cur);
@@ -11940,7 +12071,7 @@ struct llm_build_context {
11940
12071
  cb(cur, "attn_norm", il);
11941
12072
 
11942
12073
  // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
11943
- struct lm_ggml_tensor * xz = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
12074
+ struct lm_ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur);
11944
12075
  // split the above in two
11945
12076
  // => {d_inner, n_tokens}
11946
12077
  struct lm_ggml_tensor * x = lm_ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
@@ -11979,14 +12110,14 @@ struct llm_build_context {
11979
12110
  // ssm
11980
12111
  {
11981
12112
  // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
11982
- struct lm_ggml_tensor * x_db = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
12113
+ struct lm_ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x);
11983
12114
  // split
11984
12115
  struct lm_ggml_tensor * dt = lm_ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
11985
12116
  struct lm_ggml_tensor * B = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*dt_rank);
11986
12117
  struct lm_ggml_tensor * C = lm_ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], lm_ggml_element_size(x_db)*(dt_rank+d_state));
11987
12118
 
11988
12119
  // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
11989
- dt = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
12120
+ dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
11990
12121
  dt = lm_ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
11991
12122
 
11992
12123
  // Custom operator to optimize the parallel associative scan
@@ -12017,7 +12148,7 @@ struct llm_build_context {
12017
12148
  y = lm_ggml_mul(ctx0, y, lm_ggml_silu(ctx0, z));
12018
12149
 
12019
12150
  // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
12020
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
12151
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y);
12021
12152
  }
12022
12153
 
12023
12154
  // residual
@@ -12036,7 +12167,7 @@ struct llm_build_context {
12036
12167
  cb(cur, "result_norm", -1);
12037
12168
 
12038
12169
  // lm_head
12039
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12170
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12040
12171
  cb(cur, "result_output", -1);
12041
12172
 
12042
12173
  lm_ggml_build_forward_expand(gf, cur);
@@ -12075,21 +12206,21 @@ struct llm_build_context {
12075
12206
  // self-attention
12076
12207
  {
12077
12208
  // compute Q and K and RoPE them
12078
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
12209
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12079
12210
  cb(Qcur, "Qcur", il);
12080
12211
  if (model.layers[il].bq) {
12081
12212
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
12082
12213
  cb(Qcur, "Qcur", il);
12083
12214
  }
12084
12215
 
12085
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
12216
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12086
12217
  cb(Kcur, "Kcur", il);
12087
12218
  if (model.layers[il].bk) {
12088
12219
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
12089
12220
  cb(Kcur, "Kcur", il);
12090
12221
  }
12091
12222
 
12092
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
12223
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
12093
12224
  cb(Vcur, "Vcur", il);
12094
12225
  if (model.layers[il].bv) {
12095
12226
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -12135,7 +12266,7 @@ struct llm_build_context {
12135
12266
  );
12136
12267
  cb(Kcur, "Kcur", il);
12137
12268
 
12138
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
12269
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12139
12270
  model.layers[il].wo, model.layers[il].bo,
12140
12271
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12141
12272
  }
@@ -12152,7 +12283,7 @@ struct llm_build_context {
12152
12283
 
12153
12284
  // feed-forward network
12154
12285
  {
12155
- cur = llm_build_ffn(ctx0, ffn_inp,
12286
+ cur = llm_build_ffn(ctx0, lctx, ffn_inp,
12156
12287
  model.layers[il].ffn_up, NULL, NULL,
12157
12288
  model.layers[il].ffn_gate, NULL, NULL,
12158
12289
  model.layers[il].ffn_down, NULL, NULL,
@@ -12179,7 +12310,7 @@ struct llm_build_context {
12179
12310
  cb(cur, "result_norm", -1);
12180
12311
 
12181
12312
  // lm_head
12182
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12313
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12183
12314
 
12184
12315
  if (f_logit_scale) {
12185
12316
  cur = lm_ggml_scale(ctx0, cur, f_logit_scale);
@@ -12232,21 +12363,21 @@ struct llm_build_context {
12232
12363
  // self-attention
12233
12364
  {
12234
12365
  // compute Q and K and RoPE them
12235
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
12366
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12236
12367
  cb(Qcur, "Qcur", il);
12237
12368
  if (hparams.f_clamp_kqv > 0.0f) {
12238
12369
  Qcur = lm_ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
12239
12370
  cb(Qcur, "Qcur", il);
12240
12371
  }
12241
12372
 
12242
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
12373
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12243
12374
  cb(Kcur, "Kcur", il);
12244
12375
  if (hparams.f_clamp_kqv > 0.0f) {
12245
12376
  Kcur = lm_ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
12246
12377
  cb(Kcur, "Kcur", il);
12247
12378
  }
12248
12379
 
12249
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
12380
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
12250
12381
  cb(Vcur, "Vcur", il);
12251
12382
  if (hparams.f_clamp_kqv > 0.0f) {
12252
12383
  Vcur = lm_ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -12267,7 +12398,7 @@ struct llm_build_context {
12267
12398
  );
12268
12399
  cb(Kcur, "Kcur", il);
12269
12400
 
12270
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
12401
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12271
12402
  model.layers[il].wo, nullptr,
12272
12403
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12273
12404
  }
@@ -12289,7 +12420,7 @@ struct llm_build_context {
12289
12420
  LLM_NORM, cb, il);
12290
12421
  cb(cur, "ffn_norm", il);
12291
12422
 
12292
- cur = llm_build_ffn(ctx0, cur,
12423
+ cur = llm_build_ffn(ctx0, lctx, cur,
12293
12424
  model.layers[il].ffn_up, NULL, NULL,
12294
12425
  model.layers[il].ffn_gate, NULL, NULL,
12295
12426
  model.layers[il].ffn_down, NULL, NULL,
@@ -12315,7 +12446,7 @@ struct llm_build_context {
12315
12446
  cb(cur, "result_norm", -1);
12316
12447
 
12317
12448
  // lm_head
12318
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12449
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12319
12450
  cb(cur, "result_output", -1);
12320
12451
 
12321
12452
  lm_ggml_build_forward_expand(gf, cur);
@@ -12355,7 +12486,7 @@ struct llm_build_context {
12355
12486
 
12356
12487
  // self-attention
12357
12488
  {
12358
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
12489
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
12359
12490
  cb(cur, "wqkv", il);
12360
12491
 
12361
12492
  cur = lm_ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
@@ -12394,7 +12525,7 @@ struct llm_build_context {
12394
12525
  Vcur = lm_ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
12395
12526
  cb(Qcur, "Vcur", il);
12396
12527
 
12397
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
12528
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12398
12529
  model.layers[il].wo, NULL,
12399
12530
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12400
12531
  }
@@ -12416,7 +12547,7 @@ struct llm_build_context {
12416
12547
  LLM_NORM_RMS, cb, il);
12417
12548
  cb(cur, "ffn_norm", il);
12418
12549
 
12419
- cur = llm_build_ffn(ctx0, cur,
12550
+ cur = llm_build_ffn(ctx0, lctx, cur,
12420
12551
  model.layers[il].ffn_up, NULL, NULL,
12421
12552
  model.layers[il].ffn_gate, NULL, NULL,
12422
12553
  model.layers[il].ffn_down, NULL, NULL,
@@ -12440,7 +12571,7 @@ struct llm_build_context {
12440
12571
  LLM_NORM_RMS, cb, -1);
12441
12572
  cb(cur, "result_norm", -1);
12442
12573
 
12443
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12574
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12444
12575
  cb(cur, "result_output", -1);
12445
12576
 
12446
12577
  lm_ggml_build_forward_expand(gf, cur);
@@ -12475,7 +12606,7 @@ struct llm_build_context {
12475
12606
 
12476
12607
  // self-attention
12477
12608
  {
12478
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
12609
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
12479
12610
  cb(cur, "wqkv", il);
12480
12611
 
12481
12612
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -12503,7 +12634,7 @@ struct llm_build_context {
12503
12634
  );
12504
12635
  cb(Kcur, "Kcur", il);
12505
12636
 
12506
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
12637
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12507
12638
  model.layers[il].wo, model.layers[il].bo,
12508
12639
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12509
12640
  }
@@ -12528,7 +12659,7 @@ struct llm_build_context {
12528
12659
  LLM_NORM, cb, il);
12529
12660
  cb(cur, "ffn_norm", il);
12530
12661
 
12531
- cur = llm_build_ffn(ctx0, cur,
12662
+ cur = llm_build_ffn(ctx0, lctx, cur,
12532
12663
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12533
12664
  NULL, NULL, NULL,
12534
12665
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -12559,7 +12690,7 @@ struct llm_build_context {
12559
12690
  LLM_NORM, cb, il);
12560
12691
  cb(cur, "ffn_norm", il);
12561
12692
 
12562
- cur = llm_build_ffn(ctx0, cur,
12693
+ cur = llm_build_ffn(ctx0, lctx, cur,
12563
12694
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12564
12695
  NULL, NULL, NULL,
12565
12696
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -12582,7 +12713,7 @@ struct llm_build_context {
12582
12713
  LLM_NORM, cb, -1);
12583
12714
  cb(cur, "result_norm", -1);
12584
12715
 
12585
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12716
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12586
12717
  cb(cur, "result_output", -1);
12587
12718
 
12588
12719
  lm_ggml_build_forward_expand(gf, cur);
@@ -12623,13 +12754,13 @@ struct llm_build_context {
12623
12754
  // self-attention
12624
12755
  {
12625
12756
  // compute Q and K and RoPE them
12626
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
12757
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12627
12758
  cb(Qcur, "Qcur", il);
12628
12759
 
12629
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
12760
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12630
12761
  cb(Kcur, "Kcur", il);
12631
12762
 
12632
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
12763
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
12633
12764
  cb(Vcur, "Vcur", il);
12634
12765
 
12635
12766
  Qcur = lm_ggml_rope_ext(
@@ -12646,7 +12777,7 @@ struct llm_build_context {
12646
12777
  );
12647
12778
  cb(Kcur, "Kcur", il);
12648
12779
 
12649
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
12780
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12650
12781
  model.layers[il].wo, NULL,
12651
12782
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12652
12783
  }
@@ -12668,7 +12799,7 @@ struct llm_build_context {
12668
12799
  LLM_NORM_RMS, cb, il);
12669
12800
  cb(cur, "ffn_norm", il);
12670
12801
 
12671
- cur = llm_build_ffn(ctx0, cur,
12802
+ cur = llm_build_ffn(ctx0, lctx, cur,
12672
12803
  model.layers[il].ffn_up, NULL, NULL,
12673
12804
  model.layers[il].ffn_gate, NULL, NULL,
12674
12805
  model.layers[il].ffn_down, NULL, NULL,
@@ -12685,7 +12816,7 @@ struct llm_build_context {
12685
12816
  LLM_NORM_RMS, cb, il);
12686
12817
  cb(cur, "ffn_norm_exps", il);
12687
12818
 
12688
- cur = llm_build_moe_ffn(ctx0, cur,
12819
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
12689
12820
  model.layers[il].ffn_gate_inp,
12690
12821
  model.layers[il].ffn_up_exps,
12691
12822
  model.layers[il].ffn_gate_exps,
@@ -12714,7 +12845,7 @@ struct llm_build_context {
12714
12845
  cb(cur, "result_norm", -1);
12715
12846
 
12716
12847
  // lm_head
12717
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
12848
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12718
12849
  cb(cur, "result_output", -1);
12719
12850
 
12720
12851
  lm_ggml_build_forward_expand(gf, cur);
@@ -12868,7 +12999,7 @@ struct llm_build_context {
12868
12999
  struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
12869
13000
  cb(k_states, "k_states", il);
12870
13001
 
12871
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
13002
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12872
13003
  model.layers[il].wo, NULL,
12873
13004
  k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
12874
13005
  }
@@ -12890,7 +13021,7 @@ struct llm_build_context {
12890
13021
  cb(cur, "ffn_norm", il);
12891
13022
 
12892
13023
  if ((uint32_t) il < hparams.n_layer_dense_lead) {
12893
- cur = llm_build_ffn(ctx0, cur,
13024
+ cur = llm_build_ffn(ctx0, lctx, cur,
12894
13025
  model.layers[il].ffn_up, NULL, NULL,
12895
13026
  model.layers[il].ffn_gate, NULL, NULL,
12896
13027
  model.layers[il].ffn_down, NULL, NULL,
@@ -12900,7 +13031,7 @@ struct llm_build_context {
12900
13031
  } else {
12901
13032
  // MoE branch
12902
13033
  lm_ggml_tensor * moe_out =
12903
- llm_build_moe_ffn(ctx0, cur,
13034
+ llm_build_moe_ffn(ctx0, lctx, cur,
12904
13035
  model.layers[il].ffn_gate_inp,
12905
13036
  model.layers[il].ffn_up_exps,
12906
13037
  model.layers[il].ffn_gate_exps,
@@ -12913,7 +13044,7 @@ struct llm_build_context {
12913
13044
 
12914
13045
  // FFN shared expert
12915
13046
  {
12916
- lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
13047
+ lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
12917
13048
  model.layers[il].ffn_up_shexp, NULL, NULL,
12918
13049
  model.layers[il].ffn_gate_shexp, NULL, NULL,
12919
13050
  model.layers[il].ffn_down_shexp, NULL, NULL,
@@ -12978,7 +13109,7 @@ struct llm_build_context {
12978
13109
  // self-attention
12979
13110
  {
12980
13111
  // compute Q and K and RoPE them
12981
- struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur);
13112
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12982
13113
  Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
12983
13114
  cb(Qcur, "Qcur", il);
12984
13115
  if (model.layers[il].bq) {
@@ -12987,7 +13118,7 @@ struct llm_build_context {
12987
13118
  }
12988
13119
 
12989
13120
  // B1.K
12990
- struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur);
13121
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12991
13122
  Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
12992
13123
  cb(Kcur, "Kcur", il);
12993
13124
  if (model.layers[il].bk) {
@@ -12996,7 +13127,7 @@ struct llm_build_context {
12996
13127
  }
12997
13128
 
12998
13129
  // B1.V
12999
- struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur);
13130
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13000
13131
  Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
13001
13132
  cb(Vcur, "Vcur", il);
13002
13133
  if (model.layers[il].bv) {
@@ -13018,7 +13149,7 @@ struct llm_build_context {
13018
13149
  );
13019
13150
  cb(Kcur, "Kcur", il);
13020
13151
 
13021
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
13152
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13022
13153
  NULL, NULL,
13023
13154
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13024
13155
 
@@ -13027,7 +13158,7 @@ struct llm_build_context {
13027
13158
  LLM_NORM_RMS, cb, il);
13028
13159
  cb(cur, "attn_sub_norm", il);
13029
13160
 
13030
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wo, cur);
13161
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
13031
13162
  cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
13032
13163
  if (model.layers[il].bo) {
13033
13164
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
@@ -13051,7 +13182,7 @@ struct llm_build_context {
13051
13182
  LLM_NORM_RMS, cb, il);
13052
13183
  cb(cur, "ffn_norm", il);
13053
13184
 
13054
- cur = llm_build_ffn(ctx0, cur,
13185
+ cur = llm_build_ffn(ctx0, lctx, cur,
13055
13186
  model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
13056
13187
  model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
13057
13188
  NULL, NULL, NULL,
@@ -13064,7 +13195,7 @@ struct llm_build_context {
13064
13195
  LLM_NORM_RMS, cb, il);
13065
13196
  cb(cur, "ffn_sub_norm", il);
13066
13197
 
13067
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
13198
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
13068
13199
  cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
13069
13200
  cb(cur, "ffn_down", il);
13070
13201
 
@@ -13083,7 +13214,7 @@ struct llm_build_context {
13083
13214
  cb(cur, "result_norm", -1);
13084
13215
 
13085
13216
  // lm_head
13086
- cur = lm_ggml_mul_mat(ctx0, model.tok_embd, cur);
13217
+ cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
13087
13218
  cb(cur, "result_output", -1);
13088
13219
 
13089
13220
  lm_ggml_build_forward_expand(gf, cur);
@@ -13185,7 +13316,7 @@ struct llm_build_context {
13185
13316
  cb(cur, "ffn_norm", il);
13186
13317
 
13187
13318
  // T5 uses relu, flan-T5 uses gelu-gated
13188
- cur = llm_build_ffn(ctx0, cur,
13319
+ cur = llm_build_ffn(ctx0, lctx, cur,
13189
13320
  model.layers[il].ffn_up_enc, NULL, NULL,
13190
13321
  model.layers[il].ffn_gate_enc, NULL, NULL,
13191
13322
  model.layers[il].ffn_down_enc, NULL, NULL,
@@ -13365,7 +13496,7 @@ struct llm_build_context {
13365
13496
  cb(cur, "ffn_norm", il);
13366
13497
 
13367
13498
  // T5 uses relu, flan-T5 uses gelu-gated
13368
- cur = llm_build_ffn(ctx0, cur,
13499
+ cur = llm_build_ffn(ctx0, lctx, cur,
13369
13500
  model.layers[il].ffn_up, NULL, NULL,
13370
13501
  model.layers[il].ffn_gate, NULL, NULL,
13371
13502
  model.layers[il].ffn_down, NULL, NULL,
@@ -13431,7 +13562,7 @@ struct llm_build_context {
13431
13562
 
13432
13563
  // self-attention
13433
13564
  {
13434
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
13565
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
13435
13566
  cb(cur, "wqkv", il);
13436
13567
 
13437
13568
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -13447,7 +13578,7 @@ struct llm_build_context {
13447
13578
 
13448
13579
  Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13449
13580
 
13450
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
13581
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13451
13582
  model.layers[il].wo, model.layers[il].bo,
13452
13583
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
13453
13584
  }
@@ -13471,7 +13602,7 @@ struct llm_build_context {
13471
13602
  LLM_NORM, cb, il);
13472
13603
  cb(cur, "ffn_norm", il);
13473
13604
 
13474
- cur = llm_build_ffn(ctx0, cur,
13605
+ cur = llm_build_ffn(ctx0, lctx, cur,
13475
13606
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
13476
13607
  model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
13477
13608
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
@@ -13490,7 +13621,7 @@ struct llm_build_context {
13490
13621
  LLM_NORM, cb, -1);
13491
13622
  cb(cur, "result_norm", -1);
13492
13623
 
13493
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
13624
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13494
13625
 
13495
13626
  cb(cur, "result_output", -1);
13496
13627
 
@@ -13532,7 +13663,7 @@ struct llm_build_context {
13532
13663
  struct lm_ggml_tensor * Kcur = nullptr;
13533
13664
  struct lm_ggml_tensor * Vcur = nullptr;
13534
13665
 
13535
- cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
13666
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
13536
13667
  cb(cur, "wqkv", il);
13537
13668
 
13538
13669
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
@@ -13560,7 +13691,7 @@ struct llm_build_context {
13560
13691
  );
13561
13692
  cb(Kcur, "Kcur_rope", il);
13562
13693
 
13563
- cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
13694
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13564
13695
  model.layers[il].wo, NULL,
13565
13696
  Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13566
13697
 
@@ -13585,7 +13716,7 @@ struct llm_build_context {
13585
13716
  LLM_NORM_RMS, cb, il);
13586
13717
  cb(cur, "ffn_norm", il);
13587
13718
 
13588
- cur = llm_build_ffn(ctx0, cur,
13719
+ cur = llm_build_ffn(ctx0, lctx, cur,
13589
13720
  model.layers[il].ffn_up, NULL, NULL,
13590
13721
  NULL, NULL, NULL,
13591
13722
  model.layers[il].ffn_down, NULL, NULL,
@@ -13605,7 +13736,7 @@ struct llm_build_context {
13605
13736
  LLM_NORM_RMS, cb, -1);
13606
13737
  cb(cur, "result_norm", -1);
13607
13738
 
13608
- cur = lm_ggml_mul_mat(ctx0, model.output, cur);
13739
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13609
13740
  cb(cur, "result_output", -1);
13610
13741
 
13611
13742
  lm_ggml_build_forward_expand(gf, cur);
@@ -15032,6 +15163,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
15032
15163
 
15033
15164
  // apply K-shift if needed
15034
15165
  if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
15166
+ if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
15167
+ LM_GGML_ASSERT(false && "Deepseek2 does not support K-shift");
15168
+ }
15169
+
15035
15170
  {
15036
15171
  lm_ggml_backend_sched_reset(lctx.sched);
15037
15172
 
@@ -15426,6 +15561,8 @@ struct llm_tokenizer_bpe {
15426
15561
  case LLAMA_VOCAB_PRE_TYPE_STARCODER:
15427
15562
  case LLAMA_VOCAB_PRE_TYPE_REFACT:
15428
15563
  case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
15564
+ case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
15565
+ case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
15429
15566
  regex_exprs = {
15430
15567
  "\\p{N}",
15431
15568
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -15463,6 +15600,13 @@ struct llm_tokenizer_bpe {
15463
15600
  "\\p{N}",
15464
15601
  };
15465
15602
  break;
15603
+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
15604
+ // original regex from tokenizer.json
15605
+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
15606
+ regex_exprs = {
15607
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
15608
+ };
15609
+ break;
15466
15610
  default:
15467
15611
  // default regex for BPE tokenization pre-processing
15468
15612
  regex_exprs = {
@@ -17964,10 +18108,10 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
17964
18108
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = LM_GGML_TYPE_Q4_K;
17965
18109
  //}
17966
18110
  bool convert_incompatible_tensor = false;
17967
- if (new_type == LM_GGML_TYPE_Q2_K || new_type == LM_GGML_TYPE_Q3_K || new_type == LM_GGML_TYPE_Q4_K ||
17968
- new_type == LM_GGML_TYPE_Q5_K || new_type == LM_GGML_TYPE_Q6_K || new_type == LM_GGML_TYPE_IQ4_XS ||
17969
- new_type == LM_GGML_TYPE_IQ2_XS || new_type == LM_GGML_TYPE_IQ2_XXS || new_type == LM_GGML_TYPE_IQ2_S ||
17970
- new_type == LM_GGML_TYPE_IQ3_XXS || new_type == LM_GGML_TYPE_IQ1_S || new_type == LM_GGML_TYPE_IQ3_S ||
18111
+ if (new_type == LM_GGML_TYPE_Q2_K || new_type == LM_GGML_TYPE_Q3_K || new_type == LM_GGML_TYPE_Q4_K ||
18112
+ new_type == LM_GGML_TYPE_Q5_K || new_type == LM_GGML_TYPE_Q6_K || new_type == LM_GGML_TYPE_IQ4_XS ||
18113
+ new_type == LM_GGML_TYPE_IQ2_XS || new_type == LM_GGML_TYPE_IQ2_XXS || new_type == LM_GGML_TYPE_IQ2_S ||
18114
+ new_type == LM_GGML_TYPE_IQ3_XXS || new_type == LM_GGML_TYPE_IQ1_S || new_type == LM_GGML_TYPE_IQ3_S ||
17971
18115
  new_type == LM_GGML_TYPE_IQ1_M) {
17972
18116
  int nx = tensor->ne[0];
17973
18117
  int ny = tensor->ne[1];
@@ -18153,8 +18297,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18153
18297
 
18154
18298
  // copy the KV pairs from the input file
18155
18299
  lm_gguf_set_kv (ctx_out, ml.meta);
18156
- lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION);
18157
- lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype);
18300
+ lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); // TODO: use LLM_KV
18301
+ lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
18302
+
18158
18303
  // Remove split metadata
18159
18304
  lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
18160
18305
  lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -18469,282 +18614,210 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18469
18614
  }
18470
18615
  }
18471
18616
 
18472
- static int llama_apply_lora_from_file_internal(
18473
- const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
18474
- ) {
18475
- LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
18476
-
18477
- const int64_t t_start_lora_us = lm_ggml_time_us();
18478
-
18479
- llama_file fin(path_lora, "rb");
18480
-
18481
- // verify magic and version
18482
- {
18483
- uint32_t magic = fin.read_u32();
18484
- if (magic != LLAMA_FILE_MAGIC_GGLA) {
18485
- LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
18486
- return 1;
18487
- }
18617
+ static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
18618
+ LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
18488
18619
 
18489
- uint32_t format_version = fin.read_u32();
18490
- if (format_version != 1) {
18491
- LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
18492
- return 1;
18493
- }
18494
- }
18495
-
18496
- int32_t lora_r = fin.read_u32();
18497
- int32_t lora_alpha = fin.read_u32();
18498
- float scaling = scale * (float)lora_alpha / (float)lora_r;
18499
-
18500
- LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
18501
-
18502
- // load base model
18503
- std::unique_ptr<llama_model_loader> ml;
18504
- if (path_base_model) {
18505
- LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
18506
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
18507
- ml->init_mappings(/*prefetch*/ false); // no prefetching
18508
- }
18509
-
18510
- struct tensor_meta {
18511
- std::string name;
18512
- lm_ggml_type type;
18513
- int32_t ne[2];
18514
- size_t offset;
18620
+ lm_ggml_context * ctx = nullptr;
18621
+ struct lm_gguf_init_params meta_lm_gguf_params = {
18622
+ /* .no_alloc = */ true,
18623
+ /* .ctx = */ &ctx,
18515
18624
  };
18516
- std::map<std::string, tensor_meta> tensor_meta_map;
18517
-
18518
- // load all tensor meta
18519
- while (true) {
18520
- if (fin.tell() == fin.size) {
18521
- // eof
18522
- break;
18523
- }
18524
-
18525
- int32_t n_dims;
18526
- int32_t name_len;
18527
- int32_t ftype;
18528
-
18529
- fin.read_raw(&n_dims, sizeof(n_dims));
18530
- fin.read_raw(&name_len, sizeof(name_len));
18531
- fin.read_raw(&ftype, sizeof(ftype));
18532
-
18533
- if (n_dims != 1 && n_dims != 2) {
18534
- LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
18535
- return 1;
18536
- }
18625
+ struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(path_lora, meta_lm_gguf_params);
18626
+ if (!ctx_gguf) {
18627
+ throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
18628
+ }
18537
18629
 
18538
- int32_t ne[2] = { 1, 1 };
18539
- for (int i = 0; i < n_dims; ++i) {
18540
- fin.read_raw(&ne[i], sizeof(ne[i]));
18541
- }
18630
+ // check metadata
18631
+ {
18632
+ auto get_kv_str = [&](const std::string & key) -> std::string {
18633
+ int id = lm_gguf_find_key(ctx_gguf, key.c_str());
18634
+ return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf, id));
18635
+ };
18636
+ auto get_kv_f32 = [&](const std::string & key) -> float {
18637
+ int id = lm_gguf_find_key(ctx_gguf, key.c_str());
18638
+ return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf, id);
18639
+ };
18640
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
18542
18641
 
18543
- std::string name;
18544
- {
18545
- LM_GGML_ASSERT(name_len < LM_GGML_MAX_NAME);
18546
- char buf[LM_GGML_MAX_NAME];
18547
- fin.read_raw(buf, name_len);
18548
- name = std::string(buf, name_len);
18642
+ auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
18643
+ if (general_type != "adapter") {
18644
+ lm_gguf_free(ctx_gguf);
18645
+ throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
18549
18646
  }
18550
18647
 
18551
- // check for lora suffix
18552
- std::string lora_suffix;
18553
- if (name.length() > 6) {
18554
- lora_suffix = name.substr(name.length() - 6);
18555
- }
18556
- if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
18557
- LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
18558
- return 1;
18648
+ auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
18649
+ auto general_arch = llm_arch_from_string(general_arch_str);
18650
+ if (general_arch != model->arch) {
18651
+ lm_gguf_free(ctx_gguf);
18652
+ throw std::runtime_error("model arch and LoRA arch mismatch");
18559
18653
  }
18560
18654
 
18561
- // tensor type
18562
- lm_ggml_type wtype;
18563
- switch (ftype) {
18564
- case 0: wtype = LM_GGML_TYPE_F32; break;
18565
- case 1: wtype = LM_GGML_TYPE_F16; break;
18566
- default:
18567
- {
18568
- LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
18569
- __func__, ftype);
18570
- return 1;
18571
- }
18655
+ auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
18656
+ if (adapter_type != "lora") {
18657
+ lm_gguf_free(ctx_gguf);
18658
+ throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
18572
18659
  }
18573
18660
 
18574
- // data offset
18575
- size_t offset = fin.tell();
18576
- offset = (offset + 31) & -32;
18577
-
18578
- // skip tensor data
18579
- fin.seek(offset + lm_ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
18580
-
18581
- tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
18661
+ adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
18582
18662
  }
18583
18663
 
18584
- bool warned = false;
18585
- int n_tensors = 0;
18586
-
18587
- // apply
18588
- lm_ggml_backend_t backend_cpu = lm_ggml_backend_cpu_init();
18589
- if (backend_cpu == nullptr) {
18590
- LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
18591
- return 1;
18592
- }
18593
- lm_ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
18594
-
18595
- std::vector<no_init<uint8_t>> read_buf;
18596
- for (const auto & it : model.tensors_by_name) {
18597
- const std::string & base_name = it.first;
18598
- lm_ggml_tensor * model_t = it.second;
18599
-
18600
- if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
18601
- tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
18602
- continue;
18603
- }
18604
-
18605
- tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
18606
- tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
18664
+ int n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
18607
18665
 
18608
- lm_ggml_init_params lora_init_params = {
18609
- /* .mem_size */ lm_ggml_tensor_overhead()*128 + lm_ggml_graph_overhead(),
18610
- /* .mem_buffer */ nullptr,
18611
- /* .no_alloc */ true,
18666
+ // contexts for each buffer type
18667
+ std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
18668
+ auto get_ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
18669
+ auto it = ctx_map.find(buft);
18670
+ if (it == ctx_map.end()) {
18671
+ // add a new context
18672
+ struct lm_ggml_init_params params = {
18673
+ /*.mem_size =*/ n_tensors*lm_ggml_tensor_overhead(),
18674
+ /*.mem_buffer =*/ NULL,
18675
+ /*.no_alloc =*/ true,
18676
+ };
18677
+ lm_ggml_context * buft_ctx = lm_ggml_init(params);
18678
+ ctx_map[buft] = buft_ctx;
18679
+ return buft_ctx;
18612
18680
  };
18613
- lm_ggml_context * lora_ctx = lm_ggml_init(lora_init_params);
18614
- if (lora_ctx == nullptr) {
18615
- LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
18616
- lm_ggml_backend_free(backend_cpu);
18617
- return 1;
18618
- }
18619
-
18620
- // create tensors
18621
- lm_ggml_tensor * loraA = lm_ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
18622
- lm_ggml_tensor * loraB = lm_ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
18623
- lm_ggml_set_name(loraA, metaA.name.c_str());
18624
- lm_ggml_set_name(loraB, metaB.name.c_str());
18681
+ return it->second;
18682
+ };
18625
18683
 
18626
- lm_ggml_tensor * base_t;
18627
- if (ml) {
18628
- if (!ml->get_tensor_meta(base_name.c_str())) {
18629
- LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
18630
- return 1;
18684
+ // bundle lora_a and lora_b into pairs
18685
+ std::map<std::string, llama_lora_weight> ab_map;
18686
+ auto str_endswith = [](const std::string & str, const std::string & suffix) {
18687
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
18688
+ };
18689
+ for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur; cur = lm_ggml_get_next_tensor(ctx, cur)) {
18690
+ std::string name(cur->name);
18691
+ if (str_endswith(name, ".lora_a")) {
18692
+ replace_all(name, ".lora_a", "");
18693
+ if (ab_map.find(name) == ab_map.end()) {
18694
+ ab_map[name] = llama_lora_weight(cur, nullptr);
18695
+ } else {
18696
+ ab_map[name].a = cur;
18697
+ }
18698
+ } else if (str_endswith(name, ".lora_b")) {
18699
+ replace_all(name, ".lora_b", "");
18700
+ if (ab_map.find(name) == ab_map.end()) {
18701
+ ab_map[name] = llama_lora_weight(nullptr, cur);
18702
+ } else {
18703
+ ab_map[name].b = cur;
18631
18704
  }
18632
- base_t = lm_ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
18633
18705
  } else {
18634
- base_t = lm_ggml_dup_tensor(lora_ctx, model_t);
18635
- }
18636
- lm_ggml_set_name(base_t, base_name.c_str());
18637
-
18638
- // allocate in backend buffer
18639
- lm_ggml_backend_buffer_t lora_buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, lm_ggml_backend_cpu_buffer_type());
18640
- if (lora_buf == nullptr) {
18641
- LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
18642
- return 1;
18706
+ lm_gguf_free(ctx_gguf);
18707
+ lm_ggml_free(ctx);
18708
+ throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
18643
18709
  }
18710
+ }
18644
18711
 
18645
- // load tensor data
18646
- auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, lm_ggml_tensor * tensor) {
18647
- read_buf.resize(lm_ggml_nbytes(tensor));
18648
- fin.seek(tensor_meta.offset, SEEK_SET);
18649
- fin.read_raw(read_buf.data(), lm_ggml_nbytes(tensor));
18650
- lm_ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
18651
- };
18652
- load_tensor(metaA, loraA);
18653
- load_tensor(metaB, loraB);
18712
+ // add tensors
18713
+ for (auto & it : ab_map) {
18714
+ const std::string & name = it.first;
18715
+ llama_lora_weight & w = it.second;
18654
18716
 
18655
- // load base model tensor data
18656
- if (ml) {
18657
- ml->load_data_for(base_t);
18658
- } else {
18659
- lm_ggml_backend_tensor_copy(model_t, base_t);
18717
+ if (!w.a || !w.b) {
18718
+ lm_gguf_free(ctx_gguf);
18719
+ lm_ggml_free(ctx);
18720
+ throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
18660
18721
  }
18661
18722
 
18662
- if (lm_ggml_is_quantized(base_t->type) && !warned) {
18663
- LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
18664
- "use a f16 or f32 base model with --lora-base\n", __func__);
18665
- warned = true;
18723
+ // device buft and device ctx
18724
+ auto * model_tensor = llama_get_model_tensor(model, name.c_str());
18725
+ if (!model_tensor) {
18726
+ lm_gguf_free(ctx_gguf);
18727
+ lm_ggml_free(ctx);
18728
+ throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
18666
18729
  }
18667
-
18668
- if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
18669
- LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
18670
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
18671
- lm_ggml_free(lora_ctx);
18672
- lm_ggml_backend_buffer_free(lora_buf);
18673
- lm_ggml_backend_free(backend_cpu);
18674
- return 1;
18730
+ struct lm_ggml_context * dev_ctx = get_ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
18731
+ // validate tensor shape
18732
+ if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
18733
+ lm_gguf_free(ctx_gguf);
18734
+ lm_ggml_free(ctx);
18735
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape");
18675
18736
  }
18737
+ if (w.a->ne[1] != w.b->ne[0]) {
18738
+ lm_gguf_free(ctx_gguf);
18739
+ lm_ggml_free(ctx);
18740
+ throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
18741
+ }
18742
+ // save tensor to adapter
18743
+ struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
18744
+ struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
18745
+ lm_ggml_set_name(tensor_a, w.a->name);
18746
+ lm_ggml_set_name(tensor_b, w.b->name);
18747
+ adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
18748
+ }
18676
18749
 
18677
- auto build_lora_graph = [&]() {
18678
- // w = w + BA*s
18679
- lm_ggml_tensor * BA = lm_ggml_mul_mat(lora_ctx, loraA, loraB);
18680
- lm_ggml_set_name(BA, "BA");
18681
-
18682
- if (scaling != 1.0f) {
18683
- BA = lm_ggml_scale(lora_ctx, BA, scaling);
18684
- lm_ggml_set_name(BA, "BA_scaled");
18685
- }
18686
-
18687
- lm_ggml_tensor * r;
18688
- r = lm_ggml_add_inplace(lora_ctx, base_t, BA);
18689
- lm_ggml_set_name(r, "r_add");
18690
-
18691
- if (base_t->type != model_t->type) {
18692
- // convert the result to the model type
18693
- r = lm_ggml_cast(lora_ctx, r, model_t->type);
18694
- lm_ggml_set_name(r, "r_cast");
18750
+ // allocate tensors / buffers and zero
18751
+ {
18752
+ adapter.ctxs.reserve(ctx_map.size());
18753
+ adapter.bufs.reserve(ctx_map.size());
18754
+ for (auto it : ctx_map) {
18755
+ lm_ggml_backend_buffer_type_t buft = it.first;
18756
+ lm_ggml_context * ctx_dev = it.second;
18757
+ lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
18758
+ if (!buf) {
18759
+ lm_gguf_free(ctx_gguf);
18760
+ lm_ggml_free(ctx);
18761
+ throw std::runtime_error("failed to allocate buffer for lora adapter\n");
18695
18762
  }
18763
+ LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
18764
+ adapter.ctxs.push_back(ctx_dev);
18765
+ adapter.bufs.push_back(buf);
18766
+ }
18767
+ }
18696
18768
 
18697
- return r;
18769
+ // set tensor data
18770
+ {
18771
+ llama_file lm_gguf_file(path_lora, "rb");
18772
+ std::vector<uint8_t> read_buf;
18773
+ auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
18774
+ size_t offs = lm_gguf_get_data_offset(ctx_gguf) + lm_gguf_get_tensor_offset(ctx_gguf, lm_gguf_find_tensor(ctx_gguf, orig->name));
18775
+ size_t size = lm_ggml_nbytes(orig);
18776
+ read_buf.resize(size);
18777
+ lm_gguf_file.seek(offs, SEEK_SET);
18778
+ lm_gguf_file.read_raw(read_buf.data(), size);
18779
+ lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
18698
18780
  };
18699
-
18700
- lm_ggml_cgraph * gf = lm_ggml_new_graph(lora_ctx);
18701
- lm_ggml_tensor * r = build_lora_graph();
18702
- lm_ggml_build_forward_expand(gf, r);
18703
-
18704
- lm_ggml_backend_buffer_t graph_buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, lm_ggml_backend_cpu_buffer_type());
18705
- if (graph_buf == nullptr) {
18706
- LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
18707
- lm_ggml_free(lora_ctx);
18708
- lm_ggml_backend_buffer_free(lora_buf);
18709
- lm_ggml_backend_free(backend_cpu);
18710
- return 1;
18781
+ for (auto & it : adapter.ab_map) {
18782
+ auto orig = ab_map[it.first];
18783
+ auto dev = it.second;
18784
+ set_tensor(orig.a, dev.a);
18785
+ set_tensor(orig.b, dev.b);
18711
18786
  }
18787
+ }
18712
18788
 
18713
- lm_ggml_backend_graph_compute(backend_cpu, gf);
18714
-
18715
- lm_ggml_backend_tensor_set(model_t, r->data, 0, lm_ggml_nbytes(r));
18716
-
18717
- #if 0
18718
- // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
18719
- //lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(backends.data(), backends.size(), LM_GGML_DEFAULT_GRAPH_SIZE);
18720
-
18721
- // sched compute
18722
- lm_ggml_build_forward_expand(gf, build_graph());
18723
- lm_ggml_backend_sched_init_measure(sched, gf);
18724
-
18725
- // create the graph again, since the previous one was destroyed by the measure
18726
- lm_ggml_graph_clear(gf);
18727
- lm_ggml_build_forward_expand(gf, build_graph());
18728
- lm_ggml_backend_sched_graph_compute(sched, gf);
18729
- lm_ggml_backend_sched_free(sched);
18730
- #endif
18789
+ LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);
18731
18790
 
18732
- lm_ggml_backend_buffer_free(lora_buf);
18733
- lm_ggml_backend_buffer_free(graph_buf);
18734
- lm_ggml_free(lora_ctx);
18791
+ // free ctx for reading gguf
18792
+ lm_gguf_free(ctx_gguf);
18793
+ lm_ggml_free(ctx);
18794
+ }
18735
18795
 
18736
- n_tensors++;
18737
- if (n_tensors % 4 == 0) {
18738
- LLAMA_LOG_INFO(".");
18739
- }
18796
+ int32_t llama_lora_adapter_set(
18797
+ struct llama_context * ctx,
18798
+ struct llama_lora_adapter * adapter,
18799
+ float scale) {
18800
+ if (ctx->cparams.flash_attn) {
18801
+ LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
18802
+ return -1;
18740
18803
  }
18804
+ ctx->lora_adapters[adapter] = scale;
18805
+ return 0;
18806
+ }
18741
18807
 
18742
- lm_ggml_backend_free(backend_cpu);
18743
-
18744
- const int64_t t_lora_us = lm_ggml_time_us() - t_start_lora_us;
18745
- LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
18808
+ int32_t llama_lora_adapter_remove(
18809
+ struct llama_context * ctx,
18810
+ struct llama_lora_adapter * adapter) {
18811
+ auto pos = ctx->lora_adapters.find(adapter);
18812
+ if (pos != ctx->lora_adapters.end()) {
18813
+ ctx->lora_adapters.erase(pos);
18814
+ return 0;
18815
+ }
18816
+ return -1;
18817
+ }
18746
18818
 
18747
- return 0;
18819
+ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
18820
+ delete adapter;
18748
18821
  }
18749
18822
 
18750
18823
  //
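For orientation, here is a minimal usage sketch of the LoRA adapter API introduced above. The adapter is loaded once per model and applied per context with a runtime scale, instead of being merged into the base weights as the removed code path did. The model/context setup calls are the existing llama.h API; the file paths are placeholders and not part of this diff.

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) return 1;

        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = false; // llama_lora_adapter_set rejects contexts with flash_attn enabled
        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (!ctx) { llama_free_model(model); return 1; }

        // load the adapter once; returns nullptr on failure (bad metadata, shape mismatch, ...)
        llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
        if (!adapter) { llama_free(ctx); llama_free_model(model); return 1; }

        // enable the adapter for this context with a runtime scale; returns 0 on success
        llama_lora_adapter_set(ctx, adapter, 1.0f);

        // ... run inference ...

        llama_lora_adapter_remove(ctx, adapter); // stop applying it to this context
        llama_lora_adapter_free(adapter);        // adapters are also released when the model is freed

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }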
@@ -18838,6 +18911,8 @@ size_t llama_max_devices(void) {
18838
18911
  return LM_GGML_SYCL_MAX_DEVICES;
18839
18912
  #elif defined(LM_GGML_USE_VULKAN)
18840
18913
  return LM_GGML_VK_MAX_DEVICES;
18914
+ #elif defined(LM_GGML_USE_CANN)
18915
+ return LM_GGML_CANN_MAX_DEVICES;
18841
18916
  #else
18842
18917
  return 1;
18843
18918
  #endif
@@ -19179,6 +19254,30 @@ struct llama_context * llama_new_context_with_model(
19179
19254
  }
19180
19255
  ctx->backends.push_back(backend);
19181
19256
  }
19257
+ #elif defined(LM_GGML_USE_CANN)
19258
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19259
+ // TODO: lm_ggml_backend_cann does not support split tensors yet; the code is left here for now.
19260
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19261
+ lm_ggml_backend_t backend = lm_ggml_backend_cann_init(model->main_gpu);
19262
+ if (backend == nullptr) {
19263
+ LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
19264
+ llama_free(ctx);
19265
+ return nullptr;
19266
+ }
19267
+ ctx->backends.push_back(backend);
19268
+ } else {
19269
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19270
+ // TODO: currently, CANN cannot use multiple GPUs; the code is left here for a future CANN version.
19271
+ for (int32_t device = 0; device < lm_ggml_backend_cann_get_device_count(); ++device) {
19272
+ lm_ggml_backend_t backend = lm_ggml_backend_cann_init(device);
19273
+ if (backend == nullptr) {
19274
+ LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
19275
+ llama_free(ctx);
19276
+ return nullptr;
19277
+ }
19278
+ ctx->backends.push_back(backend);
19279
+ }
19280
+ }
19182
19281
  #endif
19183
19282
 
19184
19283
  #ifdef LM_GGML_USE_BLAS
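For CANN-enabled builds, device selection goes through the same llama_model_params fields as the other GPU backends; with split_mode NONE or ROW, main_gpu is the index handed to lm_ggml_backend_cann_init(). A small sketch under that assumption, with a placeholder model path:

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                    // offload layers to the NPU
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // CANN currently uses only the main device
    mparams.main_gpu     = 0;                     // device index passed to lm_ggml_backend_cann_init()
    llama_model * model  = llama_load_model_from_file("model.gguf", mparams);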
@@ -19363,7 +19462,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19363
19462
  case LLM_ARCH_BAICHUAN:
19364
19463
  case LLM_ARCH_STARCODER:
19365
19464
  case LLM_ARCH_PLAMO:
19366
- case LLM_ARCH_CODESHELL:
19367
19465
  case LLM_ARCH_ORION:
19368
19466
  case LLM_ARCH_INTERNLM2:
19369
19467
  case LLM_ARCH_MINICPM:
@@ -19393,6 +19491,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19393
19491
  case LLM_ARCH_STARCODER2:
19394
19492
  case LLM_ARCH_OPENELM:
19395
19493
  case LLM_ARCH_GPTNEOX:
19494
+ case LLM_ARCH_CODESHELL:
19396
19495
  return LLAMA_ROPE_TYPE_NEOX;
19397
19496
 
19398
19497
  // all model arches should be listed explicitly here
@@ -19525,12 +19624,14 @@ uint32_t llama_model_quantize(
19525
19624
  }
19526
19625
  }
19527
19626
 
19528
- int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
19627
+ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
19529
19628
  try {
19530
- return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
19629
+ struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
19630
+ llama_lora_adapter_init_internal(model, path_lora, *adapter);
19631
+ return adapter;
19531
19632
  } catch (const std::exception & err) {
19532
19633
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
19533
- return 1;
19634
+ return nullptr;
19534
19635
  }
19535
19636
  }
19536
19637
 
@@ -19846,7 +19947,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
19846
19947
  );
19847
19948
 
19848
19949
  // on session change it is very likely that the state size has changed - so we need to update this function
19849
- static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
19950
+ static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
19850
19951
 
19851
19952
  return s_total;
19852
19953
  }
@@ -21533,7 +21634,7 @@ static int32_t llama_chat_apply_template_internal(
21533
21634
  if (add_ass) {
21534
21635
  ss << "<|assistant|>";
21535
21636
  }
21536
- } else if (tmpl == "chaglm4" || tmpl_contains("[gMASK]<sop>")) {
21637
+ } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
21537
21638
  ss << "[gMASK]" << "<sop>";
21538
21639
  for (auto message : chat) {
21539
21640
  std::string role(message->role);
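The branch above selects the ChatGLM-4 template either by its literal name or by the "[gMASK]<sop>" marker in the model's embedded template. A hedged sketch of exercising it through the public llama_chat_apply_template API; the message contents and helper name are illustrative only:

    #include <cstdint>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Format a short conversation with the chatglm4 template; no model is needed
    // when an explicit template name is passed.
    std::string chatglm4_prompt_example() {
        std::vector<llama_chat_message> chat = {
            { "user",      "Hello" },
            { "assistant", "Hi, how can I help?" },
            { "user",      "Tell me a joke" },
        };
        std::vector<char> buf(4096);
        int32_t n = llama_chat_apply_template(nullptr, "chatglm4", chat.data(), chat.size(),
                                              /*add_ass=*/ true, buf.data(), (int32_t) buf.size());
        if (n < 0) return "";        // template not recognized
        if (n > (int32_t) buf.size()) {
            buf.resize(n);           // buffer was too small: grow and retry once
            n = llama_chat_apply_template(nullptr, "chatglm4", chat.data(), chat.size(),
                                          /*add_ass=*/ true, buf.data(), (int32_t) buf.size());
        }
        return std::string(buf.data(), n);
    }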
@@ -21754,6 +21855,8 @@ void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) {
21754
21855
  lm_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21755
21856
  #elif defined(LM_GGML_USE_CUDA)
21756
21857
  lm_ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21858
+ #elif defined(LM_GGML_USE_CANN)
21859
+ lm_ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21757
21860
  #endif
21758
21861
  }
21759
21862
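This final hunk wires the CANN backend into the same log routing as the Metal and CUDA backends. A minimal sketch of installing a custom sink via llama_log_set; the callback signature is assumed to follow lm_ggml_log_callback from ggml.h, and the function name here is illustrative:

    #include <cstdio>
    #include "llama.h"

    // Forward all llama.cpp / backend log lines (including CANN) to stderr.
    static void example_log_callback(lm_ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);
    }

    // during startup:
    //     llama_log_set(example_log_callback, nullptr);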