llama_cpp 0.10.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +13 -0
  3. data/LICENSE.txt +1 -1
  4. data/ext/llama_cpp/extconf.rb +35 -110
  5. data/ext/llama_cpp/llama_cpp.cpp +52 -28
  6. data/lib/llama_cpp/version.rb +2 -2
  7. data/sig/llama_cpp.rbs +3 -1
  8. data/vendor/include/.gitkeep +0 -0
  9. data/vendor/lib/.gitkeep +0 -0
  10. data/vendor/tmp/llama.cpp/Makefile +758 -0
  11. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c +6 -2
  12. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu +73 -63
  13. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h +1 -0
  14. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m +43 -20
  15. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.metal +464 -245
  16. data/vendor/tmp/llama.cpp/ggml-opencl.h +25 -0
  17. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.c +61 -57
  18. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.c +171 -5
  19. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.h +1 -0
  20. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp +222 -105
  21. data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h +31 -32
  22. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +38 -0
  23. metadata +30 -27
  24. data/ext/llama_cpp/src/ggml-opencl.h +0 -25
  25. data/ext/llama_cpp/src/llama-util.h +0 -546
  26. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/LICENSE +0 -0
  27. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.c +0 -0
  28. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.h +0 -0
  29. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend-impl.h +0 -0
  30. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.h +0 -0
  31. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.h +0 -0
  32. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.h +0 -0
  33. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.c +0 -0
  34. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.h +0 -0
  35. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-opencl.cpp +0 -0
  36. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.h +0 -0
  37. /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/unicode.h +0 -0
@@ -245,6 +245,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
     LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
     LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_KEY_LENGTH,
+    LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 
@@ -297,6 +299,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
     { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+    { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
    { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 
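Note on the two new metadata keys: the "%s" in LLM_KV_NAMES is filled in with the architecture name at lookup time, so for a LLaMA-architecture model they resolve to "llama.attention.key_length" and "llama.attention.value_length". A minimal standalone sketch of that expansion (fmt_kv is a hypothetical helper, not part of llama.cpp):

#include <cstdio>
#include <string>

// Hypothetical helper mirroring how the LLM_KV_NAMES templates are expanded:
// the "%s" placeholder is replaced by the model architecture name.
static std::string fmt_kv(const char * tmpl, const char * arch) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), tmpl, arch);
    return buf;
}

int main() {
    // With arch = "llama", the keys added in this release resolve to:
    std::printf("%s\n", fmt_kv("%s.attention.key_length",   "llama").c_str()); // llama.attention.key_length
    std::printf("%s\n", fmt_kv("%s.attention.value_length", "llama").c_str()); // llama.attention.value_length
    return 0;
}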
@@ -1279,6 +1283,8 @@ struct llama_hparams {
     uint32_t n_head_kv;
     uint32_t n_layer;
     uint32_t n_rot;
+    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
@@ -1305,6 +1311,8 @@ struct llama_hparams {
     if (this->n_head_kv != other.n_head_kv) return true;
     if (this->n_layer != other.n_layer) return true;
     if (this->n_rot != other.n_rot) return true;
+    if (this->n_embd_head_k != other.n_embd_head_k) return true;
+    if (this->n_embd_head_v != other.n_embd_head_v) return true;
     if (this->n_ff != other.n_ff) return true;
     if (this->n_expert != other.n_expert) return true;
     if (this->n_expert_used != other.n_expert_used) return true;
@@ -1326,12 +1334,12 @@ struct llama_hparams {
         return n_head/n_head_kv;
     }
 
-    uint32_t n_embd_head() const {
-        return n_embd/n_head;
+    uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+        return n_embd_head_k * n_head_kv;
     }
 
-    uint32_t n_embd_gqa() const {
-        return n_embd/n_gqa();
+    uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+        return n_embd_head_v * n_head_kv;
     }
 };
 
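Note: the removed n_embd_head()/n_embd_gqa() accessors derived the head size and the KV width from n_embd and the head counts; the replacements derive the cache widths from the new per-head key/value sizes, so d_k and d_v can in principle differ from n_embd/n_head. A small sketch of the arithmetic under assumed LLaMA-2-70B-style values (n_embd = 8192, n_head = 64, n_head_kv = 8):

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed example values (a GQA configuration); not read from any model file.
    const uint32_t n_embd    = 8192;
    const uint32_t n_head    = 64;
    const uint32_t n_head_kv = 8;

    // Defaults used when attention.key_length / attention.value_length are absent.
    const uint32_t n_embd_head_k = n_embd / n_head;          // 128
    const uint32_t n_embd_head_v = n_embd / n_head;          // 128

    // New accessors: total key/value width across all k-v heads.
    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // 1024
    const uint32_t n_embd_v_gqa = n_embd_head_v * n_head_kv; // 1024

    // Matches the old n_embd_gqa() = n_embd / n_gqa() when d_k == d_v == n_embd/n_head.
    const uint32_t n_gqa = n_head / n_head_kv;               // 8
    std::printf("%u %u %u\n",
        (unsigned) n_embd_k_gqa, (unsigned) n_embd_v_gqa, (unsigned) (n_embd / n_gqa)); // 1024 1024 1024
    return 0;
}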
@@ -1640,8 +1648,9 @@ static bool llama_kv_cache_init(
         uint32_t n_ctx,
         int n_gpu_layers,
         bool offload) {
-    const uint32_t n_embd = hparams.n_embd_gqa();
-    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_layer = hparams.n_layer;
 
     cache.has_shift = false;
 
@@ -1672,8 +1681,8 @@ static bool llama_kv_cache_init(
     const int i_gpu_start = (int) n_layer - n_gpu_layers;
 
     for (int i = 0; i < (int) n_layer; i++) {
-        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
-        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
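Note: each layer's cache now allocates n_embd_k_gqa*n_ctx elements for K and n_embd_v_gqa*n_ctx for V instead of a single shared n_embd_gqa width. A rough size estimate under assumed values (F16 cache, n_ctx = 4096, n_layer = 32, both widths 1024):

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed example values; not read from any model file.
    const uint64_t n_ctx        = 4096;
    const uint64_t n_layer      = 32;
    const uint64_t n_embd_k_gqa = 1024;
    const uint64_t n_embd_v_gqa = 1024;
    const uint64_t bytes_per_el = 2;   // F16 cache type

    const uint64_t k_bytes = n_embd_k_gqa * n_ctx * bytes_per_el * n_layer;
    const uint64_t v_bytes = n_embd_v_gqa * n_ctx * bytes_per_el * n_layer;

    // 1024 * 4096 * 2 B = 8 MiB per tensor per layer -> 256 MiB each for K and V.
    std::printf("KV cache ~= %.1f MiB\n", (k_bytes + v_bytes) / (1024.0 * 1024.0)); // ~512.0 MiB
    return 0;
}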
@@ -2667,6 +2676,12 @@ static void llm_load_hparams(
         // gpt-j n_rot = rotary_dim
     }
 
+    hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+    hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
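Note: get_key(..., false) makes both keys optional, so models without attention.key_length / attention.value_length in their GGUF metadata keep the old per-head size of n_embd / n_head. A minimal sketch of that fallback (get_optional_u32 is a hypothetical stand-in for ml.get_key):

#include <cstdint>
#include <optional>

// Hypothetical stand-in for llama_model_loader::get_key(..., required = false):
// yields a value only if the metadata key exists in the GGUF file.
static std::optional<uint32_t> get_optional_u32(const char * /*key*/) {
    return std::nullopt; // assume the key is absent, as for most existing models
}

int main() {
    const uint32_t n_embd = 4096, n_head = 32;

    uint32_t n_embd_head_k = n_embd / n_head;                        // default: 128
    if (auto v = get_optional_u32("llama.attention.key_length"))   n_embd_head_k = *v;

    uint32_t n_embd_head_v = n_embd / n_head;                        // default: 128
    if (auto v = get_optional_u32("llama.attention.value_length")) n_embd_head_v = *v;

    return (n_embd_head_k == 128 && n_embd_head_v == 128) ? 0 : 1;
}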
@@ -3077,8 +3092,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
     LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
     LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+    LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+    LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
+    LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
@@ -3168,10 +3187,11 @@ static bool llm_load_tensors(
 
     // create tensors for the weights
     {
-        const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_gqa = hparams.n_embd_gqa();
-        const int64_t n_layer = hparams.n_layer;
-        const int64_t n_vocab = hparams.n_vocab;
+        const int64_t n_embd = hparams.n_embd;
+        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+        const int64_t n_layer = hparams.n_layer;
+        const int64_t n_vocab = hparams.n_vocab;
 
         const auto tn = LLM_TN(model.arch);
         switch (model.arch) {
@@ -3197,7 +3217,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3265,7 +3288,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3313,7 +3339,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3363,7 +3392,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3415,7 +3447,11 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
@@ -3464,7 +3500,10 @@ static bool llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3515,7 +3554,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3562,7 +3604,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3660,7 +3705,10 @@ static bool llm_load_tensors(
             model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3709,7 +3757,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3756,7 +3807,10 @@ static bool llm_load_tensors(
             model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         }
 
-        const uint32_t n_ff = hparams.n_ff;
+        const uint32_t n_ff = hparams.n_ff;
+        const int64_t n_embd_gqa = n_embd_v_gqa;
+        GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
         const int i_gpu_start = n_layer - n_gpu_layers;
 
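Note: every architecture branch above keeps a local n_embd_gqa aliased to n_embd_v_gqa and asserts that it equals both n_embd / n_gqa() and n_embd_k_gqa, i.e. tensor loading still assumes d_k == d_v == n_embd / n_head even though llama_hparams can now express the general case. The asserted identity, spelled out with assumed numbers:

#include <cassert>
#include <cstdint>

int main() {
    // Assumed example configuration where d_k == d_v == n_embd / n_head.
    const int64_t n_embd = 4096, n_head = 32, n_head_kv = 8;
    const int64_t n_embd_head_k = n_embd / n_head;           // 128
    const int64_t n_embd_head_v = n_embd / n_head;           // 128
    const int64_t n_embd_k_gqa  = n_embd_head_k * n_head_kv; // 1024
    const int64_t n_embd_v_gqa  = n_embd_head_v * n_head_kv; // 1024
    const int64_t n_gqa         = n_head / n_head_kv;        // 4

    const int64_t n_embd_gqa = n_embd_v_gqa;
    assert(n_embd_gqa == n_embd / n_gqa);  // 1024 == 4096 / 4
    assert(n_embd_gqa == n_embd_k_gqa);    // 1024 == 1024
    return 0;
}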
@@ -3995,8 +4049,8 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }
 
-// Persimmon: n_rot = n_embd_head/2
-// Other: n_rot = n_embd_head
+// Persimmon: n_rot = n_embd_head_k/2
+// Other: n_rot = n_embd_head_k
 static void llm_build_k_shift(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
@@ -4009,17 +4063,17 @@ static void llm_build_k_shift(
         float freq_base,
         float freq_scale,
         const llm_build_cb & cb) {
-    const int64_t n_layer = hparams.n_layer;
-    const int64_t n_head_kv = hparams.n_head_kv;
-    const int64_t n_embd_gqa = hparams.n_embd_gqa();
-    const int64_t n_embd_head = hparams.n_embd_head();
-    const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
-    const float ext_factor = cparams.yarn_ext_factor;
-    const float attn_factor = cparams.yarn_attn_factor;
-    const float beta_fast = cparams.yarn_beta_fast;
-    const float beta_slow = cparams.yarn_beta_slow;
-
-    GGML_ASSERT(n_embd_head % n_rot == 0);
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
+    const float ext_factor = cparams.yarn_ext_factor;
+    const float attn_factor = cparams.yarn_attn_factor;
+    const float beta_fast = cparams.yarn_beta_fast;
+    const float beta_slow = cparams.yarn_beta_slow;
+
+    GGML_ASSERT(n_embd_head_k % n_rot == 0);
 
     struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
     cb(K_shift, "K_shift", -1);
@@ -4037,9 +4091,9 @@ static void llm_build_k_shift(
         // we rotate only the first n_rot dimensions
         ggml_rope_custom_inplace(ctx,
             ggml_view_3d(ctx, kv.k_l[il],
-                n_embd_head, n_head_kv, n_ctx,
-                ggml_row_size(kv.k_l[il]->type, n_embd_head),
-                ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+                n_embd_head_k, n_head_kv, n_ctx,
+                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
                 0),
             K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow);
@@ -4060,18 +4114,19 @@ static void llm_build_kv_store(
         int32_t kv_head,
         const llm_build_cb & cb,
         int64_t il) {
-    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
+    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);
 
-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
-        (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+        (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
         ( n_ctx)*ggml_element_size(kv.v_l[il]),
         (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);
@@ -4221,20 +4276,20 @@ static struct ggml_tensor * llm_build_kqv(
         float kq_scale,
         const llm_build_cb & cb,
         int il) {
-    const int64_t n_embd = hparams.n_embd;
-    const int64_t n_head = hparams.n_head;
-    const int64_t n_head_kv = hparams.n_head_kv;
-    const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
 
     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
-            n_embd_head, n_kv, n_head_kv,
-            ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
-            ggml_row_size(kv.k_l[il]->type, n_embd_head),
+            n_embd_head_k, n_kv, n_head_kv,
+            ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+            ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
             0);
     cb(k, "k", il);
 
@@ -4273,9 +4328,9 @@ static struct ggml_tensor * llm_build_kqv(
     // split cached v into n_head heads
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v_l[il],
-            n_kv, n_embd_head, n_head_kv,
+            n_kv, n_embd_head_v, n_head_kv,
             ggml_element_size(kv.v_l[il])*n_ctx,
-            ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+            ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
             0);
     cb(v, "v", il);
 
@@ -4285,7 +4340,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
     cb(kqv_merged, "kqv_merged", il);
 
-    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
+    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
     cb(cur, "kqv_merged_cont", il);
 
     cur = ggml_mul_mat(ctx, wo, cur);
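Note: only the widths used for the strides change here. The K cache stores, per position, all n_head_kv heads contiguously in a row of n_embd_k_gqa elements, with each head occupying an n_embd_head_k-wide slice; the V cache stays transposed, with rows of n_ctx elements. A sketch of the K-cache byte-offset arithmetic under assumed sizes (F16 elements; ggml itself is not needed for the arithmetic):

#include <cstddef>
#include <cstdio>

int main() {
    // Assumed F16 K-cache layout viewed as [n_embd_head_k, n_kv, n_head_kv]
    // out of a flat buffer of n_embd_k_gqa * n_ctx elements.
    const size_t elt_size      = 2;    // sizeof a ggml_fp16_t element
    const size_t n_embd_head_k = 128;
    const size_t n_head_kv     = 8;
    const size_t n_embd_k_gqa  = n_embd_head_k * n_head_kv;

    const size_t nb1 = elt_size * n_embd_k_gqa;   // stride between cache positions
    const size_t nb2 = elt_size * n_embd_head_k;  // stride between k-v heads within one position

    // Byte offset of element (i, pos, head) in the cached K tensor:
    const size_t i = 5, pos = 17, head = 3;
    const size_t off = i * elt_size + pos * nb1 + head * nb2;

    std::printf("offset = %zu bytes\n", off);
    return 0;
}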
@@ -4312,8 +4367,10 @@ struct llm_build_context {
     const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
     const int64_t n_head;
     const int64_t n_head_kv;
-    const int64_t n_embd_head;
-    const int64_t n_embd_gqa;
+    const int64_t n_embd_head_k;
+    const int64_t n_embd_k_gqa;
+    const int64_t n_embd_head_v;
+    const int64_t n_embd_v_gqa;
     const int64_t n_expert;
     const int64_t n_expert_used;
 
@@ -4355,8 +4412,10 @@ struct llm_build_context {
         n_ctx (cparams.n_ctx),
         n_head (hparams.n_head),
         n_head_kv (hparams.n_head_kv),
-        n_embd_head (hparams.n_embd_head()),
-        n_embd_gqa (hparams.n_embd_gqa()),
+        n_embd_head_k (hparams.n_embd_head_k),
+        n_embd_k_gqa (hparams.n_embd_k_gqa()),
+        n_embd_head_v (hparams.n_embd_head_v),
+        n_embd_v_gqa (hparams.n_embd_v_gqa()),
         n_expert (hparams.n_expert),
         n_expert_used (hparams.n_expert_used),
         freq_base (cparams.rope_freq_base),
@@ -4399,6 +4458,8 @@ struct llm_build_context {
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
         struct ggml_tensor * cur;
@@ -4583,6 +4644,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_baichuan() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -4700,6 +4764,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_falcon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -4819,6 +4888,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_starcoder() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
@@ -4915,7 +4989,12 @@ struct llm_build_context {
     struct ggml_cgraph * build_persimmon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
-        const int64_t n_rot = n_embd_head / 2;
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
+        const int64_t n_rot = n_embd_head_k / 2;
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5124,6 +5203,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_refact() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -5212,6 +5296,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_bloom() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -5303,6 +5392,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_mpt() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -5398,6 +5492,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_stablelm() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -5508,6 +5605,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_qwen() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -5619,6 +5719,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_phi2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
         struct ggml_tensor * ffn_output;
@@ -5731,6 +5836,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_plamo() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -5835,6 +5943,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_gpt2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
@@ -7912,7 +8025,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
     }
 }
 
-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
     const int64_t t_start_sample_us = ggml_time_us();
 
     k = std::max(k, (int) min_keep);
@@ -8272,7 +8385,7 @@ void llama_sample_classifier_free_guidance(
     }
 }
 
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
 
     auto N = float(llama_n_vocab(llama_get_model(ctx)));
@@ -9480,7 +9593,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
-int llama_max_devices(void) {
+int32_t llama_max_devices(void) {
     return LLAMA_MAX_DEVICES;
 }
 
@@ -9622,8 +9735,8 @@ struct llama_context * llama_new_context_with_model(
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;
 
-    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
-    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
+    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
 
     // reserve memory for context buffers
     if (!hparams.vocab_only) {
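Note: the KV-cache type check now validates the key and value head sizes separately. ggml_blck_size() is 1 for F16/F32 but e.g. 32 for Q8_0, so a quantized cache type requires the corresponding head size to be a multiple of its block size. A minimal sketch of the same constraint (block sizes hard-coded here instead of calling ggml_blck_size):

#include <cassert>
#include <cstdint>

int main() {
    // Assumed per-head sizes and cache types.
    const uint32_t n_embd_head_k = 128;
    const uint32_t n_embd_head_v = 128;

    const uint32_t blck_k = 32;  // e.g. a Q8_0 K cache (32-element blocks)
    const uint32_t blck_v = 1;   // e.g. an F16 V cache

    assert(n_embd_head_k % blck_k == 0);
    assert(n_embd_head_v % blck_v == 0);
    return 0;
}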
@@ -9791,15 +9904,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
 
-int llama_n_vocab(const struct llama_model * model) {
+int32_t llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
 
-int llama_n_ctx_train(const struct llama_model * model) {
+int32_t llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }
 
-int llama_n_embd(const struct llama_model * model) {
+int32_t llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
@@ -9807,7 +9920,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
 
-int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
         if (buf_size > 0) {
@@ -9818,11 +9931,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key,
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }
 
-int llama_model_meta_count(const struct llama_model * model) {
+int32_t llama_model_meta_count(const struct llama_model * model) {
     return (int)model->gguf_kv.size();
 }
 
-int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
             buf[0] = '\0';
@@ -9834,7 +9947,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char
     return snprintf(buf, buf_size, "%s", it->first.c_str());
 }
 
-int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
             buf[0] = '\0';
@@ -9846,9 +9959,10 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }
 
-int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "%s %s %s",
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s %s%s %s",
         llama_model_arch_name(model->arch).c_str(),
+        model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
         llama_model_type_name(model->type),
         llama_model_ftype_name(model->ftype).c_str());
 }
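Note: the description string now inserts an "<n_expert>x" prefix for mixture-of-experts models, so a model with n_expert = 8 is reported as e.g. "llama 8x7B ..." rather than "llama 7B ...". A usage sketch against the public C API (the model path and the exact output text are assumptions):

#include <cstdio>
#include "llama.h"

int main() {
    // Hypothetical model path; any GGUF model works for the purpose of this sketch.
    const char * path = "models/mixtral-8x7b-instruct.Q4_K_M.gguf";

    llama_backend_init(false);
    llama_model * model = llama_load_model_from_file(path, llama_model_default_params());
    if (model) {
        char buf[128];
        llama_model_desc(model, buf, sizeof(buf));
        // For a model with n_expert == 8 this might read something like "llama 8x7B Q4_K - Medium";
        // previously the expert count was not part of the description.
        std::printf("%s\n", buf);
        llama_free_model(model);
    }
    llama_backend_free();
    return 0;
}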
@@ -9873,7 +9987,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
     return ggml_get_tensor(model->ctx, name);
 }
 
-int llama_model_quantize(
+uint32_t llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
         const llama_model_quantize_params * params) {
@@ -9886,7 +10000,7 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
@@ -9895,7 +10009,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
@@ -9993,7 +10107,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
     }
 }
 
-int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     int result = 0;
 
     for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
@@ -10003,7 +10117,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return result;
 }
 
-int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
     return ctx->kv_self.used;
 }
 
@@ -10167,9 +10281,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;
 
-    const auto n_layer = hparams.n_layer;
-    const auto n_embd = hparams.n_embd_gqa();
-    const auto n_ctx = cparams.n_ctx;
+    const auto n_layer = hparams.n_layer;
+    const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const auto n_ctx = cparams.n_ctx;
 
     const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
     const uint32_t kv_head = kv_self.head;
@@ -10191,15 +10306,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     std::vector<struct ggml_tensor *> vout2d(n_layer);
 
     for (int il = 0; il < (int) n_layer; ++il) {
-        kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-        vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+        kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+        vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
 
         ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-            n_embd, kv_head,
-            elt_size*n_embd, 0);
+            n_embd_k_gqa, kv_head,
+            elt_size*n_embd_k_gqa, 0);
 
         ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-            kv_head, n_embd,
+            kv_head, n_embd_v_gqa,
             elt_size*n_ctx, 0);
 
         ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
@@ -10306,9 +10421,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;
 
-    const int n_layer = hparams.n_layer;
-    const int n_embd = hparams.n_embd_gqa();
-    const int n_ctx = cparams.n_ctx;
+    const int n_layer = hparams.n_layer;
+    const int n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int n_ctx = cparams.n_ctx;
 
     size_t kv_buf_size;
     uint32_t kv_head;
@@ -10332,15 +10448,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     std::vector<struct ggml_tensor *> vin2d(n_layer);
 
     for (int il = 0; il < n_layer; ++il) {
-        kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-        vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+        kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+        vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
 
         ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-            n_embd, kv_head,
-            elt_size*n_embd, 0);
+            n_embd_k_gqa, kv_head,
+            elt_size*n_embd_k_gqa, 0);
 
         ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-            kv_head, n_embd,
+            kv_head, n_embd_v_gqa,
             elt_size*n_ctx, 0);
 
         ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
@@ -10483,7 +10599,7 @@ int llama_eval(
         struct llama_context * ctx,
         llama_token * tokens,
         int32_t n_tokens,
-        int n_past) {
+        int32_t n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
 
     const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
@@ -10498,7 +10614,7 @@ int llama_eval_embd(
         struct llama_context * ctx,
         float * embd,
         int32_t n_tokens,
-        int n_past) {
+        int32_t n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
 
     llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
@@ -10569,7 +10685,7 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits) free(batch.logits);
 }
 
-int llama_decode(
+int32_t llama_decode(
         struct llama_context * ctx,
         struct llama_batch batch) {
     const int ret = llama_decode_internal(*ctx, batch);
@@ -10617,11 +10733,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
 
-int llama_add_bos_token(const struct llama_model * model) {
+int32_t llama_add_bos_token(const struct llama_model * model) {
    return model->vocab.special_add_bos;
 }
 
-int llama_add_eos_token(const struct llama_model * model) {
+int32_t llama_add_eos_token(const struct llama_model * model) {
    return model->vocab.special_add_eos;
 }
 
@@ -10641,12 +10757,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
     return model->vocab.special_eot_id;
 }
 
-int llama_tokenize(
+int32_t llama_tokenize(
         const struct llama_model * model,
         const char * text,
-        int text_len,
+        int32_t text_len,
         llama_token * tokens,
-        int n_max_tokens,
+        int32_t n_max_tokens,
         bool add_bos,
         bool special) {
     auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
@@ -10674,7 +10790,7 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_SPM: {
@@ -10775,6 +10891,7 @@ const char * llama_print_system_info(void) {
 
     s = "";
     s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
     s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";