llama_cpp 0.10.3 → 0.10.4

@@ -245,6 +245,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
  LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
  LLM_KV_ATTENTION_CLAMP_KQV,
+ LLM_KV_ATTENTION_KEY_LENGTH,
+ LLM_KV_ATTENTION_VALUE_LENGTH,
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

@@ -297,6 +299,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
  { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
  { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },

@@ -1279,6 +1283,8 @@ struct llama_hparams {
  uint32_t n_head_kv;
  uint32_t n_layer;
  uint32_t n_rot;
+ uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
  uint32_t n_ff;
  uint32_t n_expert = 0;
  uint32_t n_expert_used = 0;
@@ -1305,6 +1311,8 @@ struct llama_hparams {
  if (this->n_head_kv != other.n_head_kv) return true;
  if (this->n_layer != other.n_layer) return true;
  if (this->n_rot != other.n_rot) return true;
+ if (this->n_embd_head_k != other.n_embd_head_k) return true;
+ if (this->n_embd_head_v != other.n_embd_head_v) return true;
  if (this->n_ff != other.n_ff) return true;
  if (this->n_expert != other.n_expert) return true;
  if (this->n_expert_used != other.n_expert_used) return true;
@@ -1326,12 +1334,12 @@ struct llama_hparams {
  return n_head/n_head_kv;
  }

- uint32_t n_embd_head() const {
- return n_embd/n_head;
+ uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+ return n_embd_head_k * n_head_kv;
  }

- uint32_t n_embd_gqa() const {
- return n_embd/n_gqa();
+ uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+ return n_embd_head_v * n_head_kv;
  }
  };
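A note on the hparams hunk above: the single n_embd_head() accessor (n_embd/n_head) is split into separate key and value head sizes, and the per-layer KV width becomes head size times the number of KV heads. Below is a minimal standalone sketch of the arithmetic, with all names local to the example rather than the library API; when the key/value lengths keep their default of n_embd/n_head, the new accessors reproduce the old n_embd_gqa() value.

    #include <cassert>
    #include <cstdint>

    // Illustrative only: mirrors the new accessors with local fields and made-up sizes.
    struct hparams_sketch {
        uint32_t n_embd        = 4096; // model width
        uint32_t n_head        = 32;   // query heads
        uint32_t n_head_kv     = 8;    // key/value heads (grouped-query attention)
        uint32_t n_embd_head_k = 128;  // d_k, defaults to n_embd/n_head
        uint32_t n_embd_head_v = 128;  // d_v, defaults to n_embd/n_head

        uint32_t n_gqa()        const { return n_head / n_head_kv; }
        uint32_t n_embd_k_gqa() const { return n_embd_head_k * n_head_kv; } // K width per layer
        uint32_t n_embd_v_gqa() const { return n_embd_head_v * n_head_kv; } // V width per layer
    };

    int main() {
        hparams_sketch hp;
        // With default head sizes the new accessors equal the old n_embd / n_gqa() value.
        assert(hp.n_embd_k_gqa() == hp.n_embd / hp.n_gqa()); // 128 * 8 == 4096 / 4
        assert(hp.n_embd_v_gqa() == hp.n_embd / hp.n_gqa());
        return 0;
    }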

@@ -1640,8 +1648,9 @@ static bool llama_kv_cache_init(
  uint32_t n_ctx,
  int n_gpu_layers,
  bool offload) {
- const uint32_t n_embd = hparams.n_embd_gqa();
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const uint32_t n_layer = hparams.n_layer;

  cache.has_shift = false;

@@ -1672,8 +1681,8 @@ static bool llama_kv_cache_init(
  const int i_gpu_start = (int) n_layer - n_gpu_layers;

  for (int i = 0; i < (int) n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
- ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
  ggml_format_name(k, "cache_k_l%d", i);
  ggml_format_name(v, "cache_v_l%d", i);
  cache.k_l.push_back(k);
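The cache-init hunk above sizes K and V independently: each layer now gets n_embd_k_gqa*n_ctx elements for K and n_embd_v_gqa*n_ctx for V, so models whose key and value head sizes differ no longer share one width. A hedged back-of-the-envelope sketch of the resulting memory footprint (sizes are made up for illustration, not taken from a real model):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical model where d_k != d_v; all numbers chosen for illustration.
        const uint64_t n_ctx         = 4096;
        const uint64_t n_layer       = 32;
        const uint64_t n_head_kv     = 8;
        const uint64_t n_embd_head_k = 192;
        const uint64_t n_embd_head_v = 128;
        const uint64_t bytes_per_el  = 2;   // e.g. an f16 cache type

        const uint64_t k_bytes = n_layer * n_ctx * n_embd_head_k * n_head_kv * bytes_per_el;
        const uint64_t v_bytes = n_layer * n_ctx * n_embd_head_v * n_head_kv * bytes_per_el;
        printf("K cache: %.1f MiB, V cache: %.1f MiB\n",
               k_bytes / (1024.0 * 1024.0), v_bytes / (1024.0 * 1024.0));
        return 0;
    }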
@@ -2667,6 +2676,12 @@ static void llm_load_hparams(
  // gpt-j n_rot = rotary_dim
  }

+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
  // arch-specific KVs
  switch (model.arch) {
  case LLM_ARCH_LLAMA:
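The loader hunk above follows a default-then-override pattern: both head sizes start at n_embd/n_head and are replaced only when the GGUF file carries %s.attention.key_length / %s.attention.value_length (the trailing false marks the lookup as optional). A small sketch of that pattern with a stand-in getter; the map-based lookup is an assumption for illustration, not the real GGUF reader:

    #include <cstdint>
    #include <map>
    #include <string>

    // Stand-in for an optional metadata lookup (the real loader reads GGUF key-value pairs).
    static bool get_key_opt(const std::map<std::string, uint32_t> & kv,
                            const std::string & key, uint32_t & out) {
        auto it = kv.find(key);
        if (it == kv.end()) return false; // key absent: keep the caller's default
        out = it->second;
        return true;
    }

    int main() {
        std::map<std::string, uint32_t> gguf; // e.g. {"llama.attention.key_length", 192}
        const uint32_t n_embd = 4096, n_head = 32;

        uint32_t n_embd_head_k = n_embd / n_head;                         // default
        get_key_opt(gguf, "llama.attention.key_length",   n_embd_head_k); // optional override

        uint32_t n_embd_head_v = n_embd / n_head;                         // default
        get_key_opt(gguf, "llama.attention.value_length", n_embd_head_v); // optional override
        return 0;
    }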
@@ -3077,8 +3092,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
  LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
  LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
@@ -3168,10 +3187,11 @@ static bool llm_load_tensors(

  // create tensors for the weights
  {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_vocab = hparams.n_vocab;

  const auto tn = LLM_TN(model.arch);
  switch (model.arch) {
@@ -3197,7 +3217,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3265,7 +3288,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3313,7 +3339,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3363,7 +3392,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3415,7 +3447,11 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
  const int i_gpu_start = n_layer - n_gpu_layers;
  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {
@@ -3464,7 +3500,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3515,7 +3554,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3562,7 +3604,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3660,7 +3705,10 @@ static bool llm_load_tensors(
  model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3709,7 +3757,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3756,7 +3807,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3995,8 +4049,8 @@ static struct ggml_tensor * llm_build_inp_embd(
  return inpL;
  }

- // Persimmon: n_rot = n_embd_head/2
- // Other: n_rot = n_embd_head
+ // Persimmon: n_rot = n_embd_head_k/2
+ // Other: n_rot = n_embd_head_k
  static void llm_build_k_shift(
  struct ggml_context * ctx,
  const llama_hparams & hparams,
@@ -4009,17 +4063,17 @@ static void llm_build_k_shift(
  float freq_base,
  float freq_scale,
  const llm_build_cb & cb) {
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const int64_t n_embd_head = hparams.n_embd_head();
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
- const float ext_factor = cparams.yarn_ext_factor;
- const float attn_factor = cparams.yarn_attn_factor;
- const float beta_fast = cparams.yarn_beta_fast;
- const float beta_slow = cparams.yarn_beta_slow;
-
- GGML_ASSERT(n_embd_head % n_rot == 0);
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
+ const float ext_factor = cparams.yarn_ext_factor;
+ const float attn_factor = cparams.yarn_attn_factor;
+ const float beta_fast = cparams.yarn_beta_fast;
+ const float beta_slow = cparams.yarn_beta_slow;
+
+ GGML_ASSERT(n_embd_head_k % n_rot == 0);

  struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
  cb(K_shift, "K_shift", -1);
@@ -4037,9 +4091,9 @@ static void llm_build_k_shift(
  // we rotate only the first n_rot dimensions
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head, n_head_kv, n_ctx,
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ n_embd_head_k, n_head_kv, n_ctx,
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
  0),
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
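In the K-shift hunk above, each layer's K cache is viewed as a 3D tensor of shape [n_embd_head_k, n_head_kv, n_ctx], with one head row as the second stride and one full K row (n_embd_k_gqa elements) as the third. A standalone sketch of the same index arithmetic over a plain array, assuming a densely packed f32 cache and making no ggml calls:

    #include <cstddef>
    #include <vector>

    int main() {
        const size_t n_embd_head_k = 4, n_head_kv = 2, n_ctx = 3;
        const size_t n_embd_k_gqa  = n_embd_head_k * n_head_kv;

        // Flat K cache for one layer: n_ctx rows of n_embd_k_gqa elements each.
        std::vector<float> k(n_embd_k_gqa * n_ctx, 0.0f);

        // Same addressing as the 3D view [n_embd_head_k, n_head_kv, n_ctx]:
        // stride over heads = n_embd_head_k, stride over positions = n_embd_k_gqa.
        auto at = [&](size_t d, size_t h, size_t pos) -> float & {
            return k[pos * n_embd_k_gqa + h * n_embd_head_k + d];
        };
        at(0, 1, 2) = 1.0f; // dimension 0 of head 1 at position 2
        return 0;
    }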
@@ -4060,18 +4114,19 @@ static void llm_build_kv_store(
  int32_t kv_head,
  const llm_build_cb & cb,
  int64_t il) {
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

  // compute the transposed [n_tokens, n_embd] V matrix
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
  cb(v_cur_t, "v_cur_t", il);

- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
- (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+ (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
  cb(k_cache_view, "k_cache_view", il);

- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
  ( n_ctx)*ggml_element_size(kv.v_l[il]),
  (kv_head)*ggml_element_size(kv.v_l[il]));
  cb(v_cache_view, "v_cache_view", il);
@@ -4221,20 +4276,20 @@ static struct ggml_tensor * llm_build_kqv(
  float kq_scale,
  const llm_build_cb & cb,
  int il) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;

  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  cb(q, "q", il);

  struct ggml_tensor * k =
  ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head, n_kv, n_head_kv,
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ n_embd_head_k, n_kv, n_head_kv,
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
  0);
  cb(k, "k", il);

@@ -4273,9 +4328,9 @@ static struct ggml_tensor * llm_build_kqv(
  // split cached v into n_head heads
  struct ggml_tensor * v =
  ggml_view_3d(ctx, kv.v_l[il],
- n_kv, n_embd_head, n_head_kv,
+ n_kv, n_embd_head_v, n_head_kv,
  ggml_element_size(kv.v_l[il])*n_ctx,
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
  0);
  cb(v, "v", il);

@@ -4285,7 +4340,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  cb(kqv_merged, "kqv_merged", il);

- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
+ struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
  cb(cur, "kqv_merged_cont", il);

  cur = ggml_mul_mat(ctx, wo, cur);
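The change just above sizes the merged attention output from the head dimension and the number of query heads instead of n_embd, which matters for hypothetical models whose head size is not n_embd/n_head. A small arithmetic check under that assumption (values are illustrative, not from any shipped model):

    #include <cassert>
    #include <cstdint>

    int main() {
        // Hypothetical: head size decoupled from n_embd / n_head.
        const int64_t n_head        = 32;
        const int64_t n_embd        = 4096;
        const int64_t n_embd_head_k = 192;                     // larger than 4096 / 32 == 128
        const int64_t attn_out_cols = n_embd_head_k * n_head;  // 6144

        // Sizing the merged output as n_embd would be wrong for such a model;
        // deriving it from the head dimension keeps the shapes consistent.
        assert(attn_out_cols != n_embd);
        return 0;
    }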
@@ -4312,8 +4367,10 @@ struct llm_build_context {
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
  const int64_t n_head;
  const int64_t n_head_kv;
- const int64_t n_embd_head;
- const int64_t n_embd_gqa;
+ const int64_t n_embd_head_k;
+ const int64_t n_embd_k_gqa;
+ const int64_t n_embd_head_v;
+ const int64_t n_embd_v_gqa;
  const int64_t n_expert;
  const int64_t n_expert_used;

@@ -4355,8 +4412,10 @@ struct llm_build_context {
  n_ctx (cparams.n_ctx),
  n_head (hparams.n_head),
  n_head_kv (hparams.n_head_kv),
- n_embd_head (hparams.n_embd_head()),
- n_embd_gqa (hparams.n_embd_gqa()),
+ n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
+ n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
  n_expert (hparams.n_expert),
  n_expert_used (hparams.n_expert_used),
  freq_base (cparams.rope_freq_base),
@@ -4399,6 +4458,8 @@ struct llm_build_context {
  struct ggml_cgraph * build_llama() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  GGML_ASSERT(n_embd_head == hparams.n_rot);

  struct ggml_tensor * cur;
@@ -4583,6 +4644,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_baichuan() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -4700,6 +4764,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_falcon() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -4819,6 +4888,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_starcoder() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
  struct ggml_tensor * inpL;
@@ -4915,7 +4989,12 @@ struct llm_build_context {
  struct ggml_cgraph * build_persimmon() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

- const int64_t n_rot = n_embd_head / 2;
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
+ const int64_t n_rot = n_embd_head_k / 2;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
@@ -5124,6 +5203,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_refact() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5212,6 +5296,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_bloom() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5303,6 +5392,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_mpt() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5398,6 +5492,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_stablelm() {
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5508,6 +5605,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_qwen() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5619,6 +5719,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_phi2() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * attn_norm_output;
  struct ggml_tensor * ffn_output;
@@ -5731,6 +5836,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_plamo() {
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5835,6 +5943,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_gpt2() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
  struct ggml_tensor * inpL;
@@ -7912,7 +8025,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
  }
  }

- void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
  const int64_t t_start_sample_us = ggml_time_us();

  k = std::max(k, (int) min_keep);
@@ -8272,7 +8385,7 @@ void llama_sample_classifier_free_guidance(
  }
  }

- llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
  GGML_ASSERT(ctx);

  auto N = float(llama_n_vocab(llama_get_model(ctx)));
@@ -9480,7 +9593,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

- int llama_max_devices(void) {
+ int32_t llama_max_devices(void) {
  return LLAMA_MAX_DEVICES;
  }

@@ -9622,8 +9735,8 @@ struct llama_context * llama_new_context_with_model(
  const ggml_type type_k = params.type_k;
  const ggml_type type_v = params.type_v;

- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
+ GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+ GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

  // reserve memory for context buffers
  if (!hparams.vocab_only) {
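The assertion change above validates each head dimension against its cache type's block size separately (quantized ggml types pack a fixed number of elements per block, so a cached head row must be a whole number of blocks). The same divisibility test in isolation, with an assumed block size of 32:

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t blck_size     = 32;   // assumed block size of a quantized cache type
        const uint32_t n_embd_head_k = 128;
        const uint32_t n_embd_head_v = 128;

        assert(n_embd_head_k % blck_size == 0); // K head rows are whole blocks
        assert(n_embd_head_v % blck_size == 0); // V head rows are whole blocks
        return 0;
    }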
@@ -9791,15 +9904,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
  return model->vocab.type;
  }

- int llama_n_vocab(const struct llama_model * model) {
+ int32_t llama_n_vocab(const struct llama_model * model) {
  return model->vocab.id_to_token.size();
  }

- int llama_n_ctx_train(const struct llama_model * model) {
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
  return model->hparams.n_ctx_train;
  }

- int llama_n_embd(const struct llama_model * model) {
+ int32_t llama_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

@@ -9807,7 +9920,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }

- int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
  const auto & it = model->gguf_kv.find(key);
  if (it == model->gguf_kv.end()) {
  if (buf_size > 0) {
@@ -9818,11 +9931,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key,
  return snprintf(buf, buf_size, "%s", it->second.c_str());
  }

- int llama_model_meta_count(const struct llama_model * model) {
+ int32_t llama_model_meta_count(const struct llama_model * model) {
  return (int)model->gguf_kv.size();
  }

- int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
  if (buf_size > 0) {
  buf[0] = '\0';
@@ -9834,7 +9947,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char
  return snprintf(buf, buf_size, "%s", it->first.c_str());
  }

- int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
  if (buf_size > 0) {
  buf[0] = '\0';
@@ -9846,9 +9959,10 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c
  return snprintf(buf, buf_size, "%s", it->second.c_str());
  }

- int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "%s %s %s",
+ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+ return snprintf(buf, buf_size, "%s %s%s %s",
  llama_model_arch_name(model->arch).c_str(),
+ model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }
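The llama_model_desc change above prefixes the size name with the expert count for mixture-of-experts models, so a description that used to read like "llama 7B mostly Q4_K" can come out as "llama 8x7B mostly Q4_K" (the example strings here are illustrative, not exact library output). The formatting logic in isolation:

    #include <cstdio>
    #include <string>

    int main() {
        const unsigned n_expert = 8; // 0 for dense models
        const std::string prefix = n_expert > 0 ? std::to_string(n_expert) + "x" : "";

        char buf[64];
        // Illustrative stand-ins; the real call pulls arch/type/ftype names from the model.
        snprintf(buf, sizeof(buf), "%s %s%s %s", "llama", prefix.c_str(), "7B", "mostly Q4_K");
        printf("%s\n", buf); // prints "llama 8x7B mostly Q4_K"
        return 0;
    }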
@@ -9873,7 +9987,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
  return ggml_get_tensor(model->ctx, name);
  }

- int llama_model_quantize(
+ uint32_t llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
  const llama_model_quantize_params * params) {
@@ -9886,7 +10000,7 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+ int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
  try {
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
@@ -9895,7 +10009,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
  try {
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
@@ -9993,7 +10107,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
  }
  }

- int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+ int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  int result = 0;

  for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
@@ -10003,7 +10117,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return result;
  }

- int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
  return ctx->kv_self.used;
  }

@@ -10167,9 +10281,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const auto & hparams = ctx->model.hparams;
  const auto & cparams = ctx->cparams;

- const auto n_layer = hparams.n_layer;
- const auto n_embd = hparams.n_embd_gqa();
- const auto n_ctx = cparams.n_ctx;
+ const auto n_layer = hparams.n_layer;
+ const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const auto n_ctx = cparams.n_ctx;

  const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
  const uint32_t kv_head = kv_self.head;
@@ -10191,15 +10306,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  std::vector<struct ggml_tensor *> vout2d(n_layer);

  for (int il = 0; il < (int) n_layer; ++il) {
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+ vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd, kv_head,
- elt_size*n_embd, 0);
+ n_embd_k_gqa, kv_head,
+ elt_size*n_embd_k_gqa, 0);

  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd,
+ kv_head, n_embd_v_gqa,
  elt_size*n_ctx, 0);

  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
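In the state-saving hunk above, K is copied out as a [n_embd_k_gqa, kv_head] block of contiguous rows, while V lives in its transposed layout and is copied as [kv_head, n_embd_v_gqa] with a row stride of n_ctx cells. A plain-array sketch of the two copy patterns, assuming f32 cells and omitting the ggml graph machinery:

    #include <algorithm>
    #include <vector>

    int main() {
        const size_t n_ctx = 8, kv_head = 3;             // kv_head = number of used positions
        const size_t n_embd_k_gqa = 6, n_embd_v_gqa = 4;

        // K cache: one contiguous row of n_embd_k_gqa per position, so the first
        // kv_head rows can be copied as a single block.
        std::vector<float> k_cache(n_embd_k_gqa * n_ctx), k_out(n_embd_k_gqa * kv_head);
        std::copy(k_cache.begin(), k_cache.begin() + n_embd_k_gqa * kv_head, k_out.begin());

        // V cache: stored transposed, one row of n_ctx per value dimension, so only
        // the first kv_head entries of each of the n_embd_v_gqa rows are live.
        std::vector<float> v_cache(n_ctx * n_embd_v_gqa), v_out(kv_head * n_embd_v_gqa);
        for (size_t d = 0; d < n_embd_v_gqa; ++d) {
            for (size_t p = 0; p < kv_head; ++p) {
                v_out[d * kv_head + p] = v_cache[d * n_ctx + p];
            }
        }
        return 0;
    }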
@@ -10306,9 +10421,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  const auto & hparams = ctx->model.hparams;
  const auto & cparams = ctx->cparams;

- const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = cparams.n_ctx;
+ const int n_layer = hparams.n_layer;
+ const int n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int n_ctx = cparams.n_ctx;

  size_t kv_buf_size;
  uint32_t kv_head;
@@ -10332,15 +10448,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  std::vector<struct ggml_tensor *> vin2d(n_layer);

  for (int il = 0; il < n_layer; ++il) {
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+ vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd, kv_head,
- elt_size*n_embd, 0);
+ n_embd_k_gqa, kv_head,
+ elt_size*n_embd_k_gqa, 0);

  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd,
+ kv_head, n_embd_v_gqa,
  elt_size*n_ctx, 0);

  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
@@ -10483,7 +10599,7 @@ int llama_eval(
  struct llama_context * ctx,
  llama_token * tokens,
  int32_t n_tokens,
- int n_past) {
+ int32_t n_past) {
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

  const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
@@ -10498,7 +10614,7 @@ int llama_eval_embd(
  struct llama_context * ctx,
  float * embd,
  int32_t n_tokens,
- int n_past) {
+ int32_t n_past) {
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

  llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
@@ -10569,7 +10685,7 @@ void llama_batch_free(struct llama_batch batch) {
  if (batch.logits) free(batch.logits);
  }

- int llama_decode(
+ int32_t llama_decode(
  struct llama_context * ctx,
  struct llama_batch batch) {
  const int ret = llama_decode_internal(*ctx, batch);
@@ -10617,11 +10733,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }

- int llama_add_bos_token(const struct llama_model * model) {
+ int32_t llama_add_bos_token(const struct llama_model * model) {
  return model->vocab.special_add_bos;
  }

- int llama_add_eos_token(const struct llama_model * model) {
+ int32_t llama_add_eos_token(const struct llama_model * model) {
  return model->vocab.special_add_eos;
  }

@@ -10641,12 +10757,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
  return model->vocab.special_eot_id;
  }

- int llama_tokenize(
+ int32_t llama_tokenize(
  const struct llama_model * model,
  const char * text,
- int text_len,
+ int32_t text_len,
  llama_token * tokens,
- int n_max_tokens,
+ int32_t n_max_tokens,
  bool add_bos,
  bool special) {
  auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
@@ -10674,7 +10790,7 @@ static std::string llama_decode_text(const std::string & text) {
  }

  // does not write null-terminator to buf
- int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
  if (0 <= token && token < llama_n_vocab(model)) {
  switch (llama_vocab_get_type(model->vocab)) {
  case LLAMA_VOCAB_TYPE_SPM: {
@@ -10775,6 +10891,7 @@ const char * llama_print_system_info(void) {

  s = "";
  s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+ s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
  s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";