llama_cpp 0.10.3 → 0.10.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/ext/llama_cpp/src/ggml-backend.c +6 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +73 -63
- data/ext/llama_cpp/src/ggml-impl.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +43 -20
- data/ext/llama_cpp/src/ggml-metal.metal +464 -245
- data/ext/llama_cpp/src/ggml-opencl.h +9 -9
- data/ext/llama_cpp/src/ggml-quants.c +61 -57
- data/ext/llama_cpp/src/ggml.c +171 -5
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +222 -105
- data/ext/llama_cpp/src/llama.h +31 -32
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -245,6 +245,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
     LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
     LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_KEY_LENGTH,
+    LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

@@ -297,6 +299,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
     { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+    { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },

@@ -1279,6 +1283,8 @@ struct llama_hparams {
     uint32_t n_head_kv;
     uint32_t n_layer;
     uint32_t n_rot;
+    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
@@ -1305,6 +1311,8 @@ struct llama_hparams {
     if (this->n_head_kv != other.n_head_kv) return true;
     if (this->n_layer != other.n_layer) return true;
     if (this->n_rot != other.n_rot) return true;
+    if (this->n_embd_head_k != other.n_embd_head_k) return true;
+    if (this->n_embd_head_v != other.n_embd_head_v) return true;
     if (this->n_ff != other.n_ff) return true;
     if (this->n_expert != other.n_expert) return true;
     if (this->n_expert_used != other.n_expert_used) return true;
@@ -1326,12 +1334,12 @@ struct llama_hparams {
         return n_head/n_head_kv;
     }

-    uint32_t
-    return
+    uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+        return n_embd_head_k * n_head_kv;
     }

-    uint32_t
-    return
+    uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+        return n_embd_head_v * n_head_kv;
     }
 };

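Taken together, the new hparams fields let key and value head sizes differ and make the GQA cache widths explicit. A minimal standalone sketch of the same arithmetic; the struct and the example numbers are illustrative and not taken from the gem:

    #include <cstdint>
    #include <cstdio>

    struct hparams_sketch {
        uint32_t n_head;        // query heads
        uint32_t n_head_kv;     // key/value heads (== n_head when GQA is off)
        uint32_t n_embd_head_k; // per-head key dimension (d_k)
        uint32_t n_embd_head_v; // per-head value dimension (d_v)

        uint32_t n_embd_k_gqa() const { return n_embd_head_k * n_head_kv; }
        uint32_t n_embd_v_gqa() const { return n_embd_head_v * n_head_kv; }
    };

    int main() {
        // illustrative GQA configuration: 64 query heads sharing 8 k-v heads of size 128
        hparams_sketch hp{64, 8, 128, 128};
        // prints 1024 1024: the K and V rows cached per token are much narrower than n_embd
        printf("%u %u\n", hp.n_embd_k_gqa(), hp.n_embd_v_gqa());
        return 0;
    }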
@@ -1640,8 +1648,9 @@ static bool llama_kv_cache_init(
     uint32_t n_ctx,
     int n_gpu_layers,
     bool offload) {
-    const uint32_t
-    const uint32_t
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_layer = hparams.n_layer;

     cache.has_shift = false;

@@ -1672,8 +1681,8 @@ static bool llama_kv_cache_init(
     const int i_gpu_start = (int) n_layer - n_gpu_layers;

     for (int i = 0; i < (int) n_layer; i++) {
-        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype,
-        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype,
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
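The per-layer cache tensors are now sized from the K- and V-specific GQA widths rather than a single shared width. A rough sketch of the resulting memory footprint; ggml type sizes are simplified to a bytes-per-element factor, which ignores quantized block layouts:

    #include <cstdint>
    #include <cstdio>

    // approximate KV cache bytes for one layer, given the new hparams
    static size_t kv_cache_bytes_per_layer(uint32_t n_embd_k_gqa, uint32_t n_embd_v_gqa,
                                           uint32_t n_ctx, double bytes_per_elem) {
        const size_t k_elems = (size_t) n_embd_k_gqa * n_ctx; // cache_k_l%d
        const size_t v_elems = (size_t) n_embd_v_gqa * n_ctx; // cache_v_l%d
        return (size_t) ((k_elems + v_elems) * bytes_per_elem);
    }

    int main() {
        // illustrative numbers: 1024-wide GQA K/V rows, 4096-token context, f16 cache
        const size_t per_layer = kv_cache_bytes_per_layer(1024, 1024, 4096, 2.0);
        printf("%zu MiB per layer\n", per_layer / (1024 * 1024)); // 16 MiB
        return 0;
    }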
@@ -2667,6 +2676,12 @@ static void llm_load_hparams(
     // gpt-j n_rot = rotary_dim
     }

+    hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+    hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
     // arch-specific KVs
     switch (model.arch) {
     case LLM_ARCH_LLAMA:
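Both head sizes default to n_embd / n_head and are then optionally overridden by the new GGUF keys "%s.attention.key_length" and "%s.attention.value_length"; the trailing false makes the lookup non-mandatory. A small sketch of that default-then-override pattern, with std::optional standing in for the GGUF reader (maybe_override is a hypothetical helper, not part of llama.cpp):

    #include <cstdint>
    #include <optional>

    // hypothetical stand-in for ml.get_key(..., /*required =*/ false)
    static void maybe_override(uint32_t & dst, const std::optional<uint32_t> & gguf_value) {
        if (gguf_value) {
            dst = *gguf_value; // key present: take the model's explicit head size
        }                      // key absent: keep the n_embd / n_head default
    }

    int main() {
        const uint32_t n_embd = 4096, n_head = 32;

        uint32_t n_embd_head_k = n_embd / n_head; // default: 128
        uint32_t n_embd_head_v = n_embd / n_head; // default: 128

        maybe_override(n_embd_head_k, std::nullopt);                // no key_length -> stays 128
        maybe_override(n_embd_head_v, std::optional<uint32_t>{96}); // explicit value_length -> 96
        return 0;
    }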
@@ -3077,8 +3092,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
     LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
     LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+    LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+    LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
+    LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
@@ -3168,10 +3187,11 @@ static bool llm_load_tensors(

     // create tensors for the weights
     {
-    const int64_t n_embd
-    const int64_t
-    const int64_t
-    const int64_t
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_vocab = hparams.n_vocab;

     const auto tn = LLM_TN(model.arch);
     switch (model.arch) {
@@ -3197,7 +3217,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3265,7 +3288,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3313,7 +3339,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3363,7 +3392,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3415,7 +3447,11 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
     const int i_gpu_start = n_layer - n_gpu_layers;
     model.layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -3464,7 +3500,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3515,7 +3554,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3562,7 +3604,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3660,7 +3705,10 @@ static bool llm_load_tensors(
     model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3709,7 +3757,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3756,7 +3807,10 @@ static bool llm_load_tensors(
     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
     }

-    const uint32_t n_ff
+    const uint32_t n_ff = hparams.n_ff;
+    const int64_t n_embd_gqa = n_embd_v_gqa;
+    GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+    GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3995,8 +4049,8 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }

-// Persimmon: n_rot =
-// Other: n_rot =
+// Persimmon: n_rot = n_embd_head_k/2
+// Other: n_rot = n_embd_head_k
 static void llm_build_k_shift(
     struct ggml_context * ctx,
     const llama_hparams & hparams,
@@ -4009,17 +4063,17 @@ static void llm_build_k_shift(
     float freq_base,
     float freq_scale,
     const llm_build_cb & cb) {
-    const int64_t n_layer
-    const int64_t n_head_kv
-    const int64_t
-    const int64_t
-    const int32_t n_orig_ctx
-    const float ext_factor
-    const float attn_factor
-    const float beta_fast
-    const float beta_slow
-
-    GGML_ASSERT(
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
+    const float ext_factor = cparams.yarn_ext_factor;
+    const float attn_factor = cparams.yarn_attn_factor;
+    const float beta_fast = cparams.yarn_beta_fast;
+    const float beta_slow = cparams.yarn_beta_slow;
+
+    GGML_ASSERT(n_embd_head_k % n_rot == 0);

     struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
     cb(K_shift, "K_shift", -1);
@@ -4037,9 +4091,9 @@ static void llm_build_k_shift(
     // we rotate only the first n_rot dimensions
     ggml_rope_custom_inplace(ctx,
         ggml_view_3d(ctx, kv.k_l[il],
-
-            ggml_row_size(kv.k_l[il]->type,
-            ggml_row_size(kv.k_l[il]->type,
+            n_embd_head_k, n_head_kv, n_ctx,
+            ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+            ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
             0),
         K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow);
@@ -4060,18 +4114,19 @@ static void llm_build_kv_store(
     int32_t kv_head,
     const llm_build_cb & cb,
     int64_t il) {
-    const int64_t
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

     // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur,
+    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);

-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*
-        (ggml_row_size(kv.k_l[il]->type,
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+        (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);

-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens,
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
         ( n_ctx)*ggml_element_size(kv.v_l[il]),
         (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);
@@ -4221,20 +4276,20 @@ static struct ggml_tensor * llm_build_kqv(
     float kq_scale,
     const llm_build_cb & cb,
     int il) {
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;

     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);

     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
-
-            ggml_row_size(kv.k_l[il]->type,
-            ggml_row_size(kv.k_l[il]->type,
+            n_embd_head_k, n_kv, n_head_kv,
+            ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+            ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
             0);
     cb(k, "k", il);

@@ -4273,9 +4328,9 @@ static struct ggml_tensor * llm_build_kqv(
     // split cached v into n_head heads
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v_l[il],
-            n_kv,
+            n_kv, n_embd_head_v, n_head_kv,
             ggml_element_size(kv.v_l[il])*n_ctx,
-            ggml_element_size(kv.v_l[il])*n_ctx*
+            ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
             0);
     cb(v, "v", il);

@@ -4285,7 +4340,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
     cb(kqv_merged, "kqv_merged", il);

-    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged,
+    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
     cb(cur, "kqv_merged_cont", il);

     cur = ggml_mul_mat(ctx, wo, cur);
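The K cache for a layer is a flat buffer of n_embd_k_gqa * n_ctx elements, stored token-major: one n_embd_k_gqa-wide row per cached position, with head h occupying the slice [h*n_embd_head_k, (h+1)*n_embd_head_k) inside that row. The 3-D views above express that layout through strides. A sketch of the same index arithmetic in plain C++, using element offsets rather than the ggml_row_size byte strides from the diff:

    #include <cstddef>
    #include <cstdio>

    // flat element index of K[dim i, position t, head h] in one layer's cache buffer
    static size_t k_cache_index(size_t i, size_t t, size_t h,
                                size_t n_embd_head_k, size_t n_embd_k_gqa) {
        return t * n_embd_k_gqa  // stride between cached positions (nb1 in the kqv view)
             + h * n_embd_head_k // stride between k-v heads        (nb2 in the kqv view)
             + i;                // contiguous within a head
    }

    int main() {
        const size_t n_embd_head_k = 128, n_head_kv = 8;
        const size_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // 1024
        // element 5 of head 2 at position 3 -> 3*1024 + 2*128 + 5 = 3333
        printf("%zu\n", k_cache_index(5, 3, 2, n_embd_head_k, n_embd_k_gqa));
        return 0;
    }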
@@ -4312,8 +4367,10 @@ struct llm_build_context {
     const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
     const int64_t n_head;
     const int64_t n_head_kv;
-    const int64_t
-    const int64_t
+    const int64_t n_embd_head_k;
+    const int64_t n_embd_k_gqa;
+    const int64_t n_embd_head_v;
+    const int64_t n_embd_v_gqa;
     const int64_t n_expert;
     const int64_t n_expert_used;

@@ -4355,8 +4412,10 @@ struct llm_build_context {
     n_ctx (cparams.n_ctx),
     n_head (hparams.n_head),
     n_head_kv (hparams.n_head_kv),
-
-
+    n_embd_head_k (hparams.n_embd_head_k),
+    n_embd_k_gqa (hparams.n_embd_k_gqa()),
+    n_embd_head_v (hparams.n_embd_head_v),
+    n_embd_v_gqa (hparams.n_embd_v_gqa()),
     n_expert (hparams.n_expert),
     n_expert_used (hparams.n_expert_used),
     freq_base (cparams.rope_freq_base),
@@ -4399,6 +4458,8 @@ struct llm_build_context {
     struct ggml_cgraph * build_llama() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     GGML_ASSERT(n_embd_head == hparams.n_rot);

     struct ggml_tensor * cur;
@@ -4583,6 +4644,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_baichuan() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -4700,6 +4764,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_falcon() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -4819,6 +4888,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_starcoder() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * pos;
     struct ggml_tensor * inpL;
@@ -4915,7 +4989,12 @@ struct llm_build_context {
     struct ggml_cgraph * build_persimmon() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

-    const int64_t
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
+    const int64_t n_rot = n_embd_head_k / 2;

     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -5124,6 +5203,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_refact() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -5212,6 +5296,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_bloom() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -5303,6 +5392,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_mpt() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -5398,6 +5492,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_stablelm() {
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -5508,6 +5605,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_qwen() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -5619,6 +5719,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_phi2() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * attn_norm_output;
     struct ggml_tensor * ffn_output;
@@ -5731,6 +5836,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_plamo() {
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

@@ -5835,6 +5943,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_gpt2() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_gqa == n_embd);
+
     struct ggml_tensor * cur;
     struct ggml_tensor * pos;
     struct ggml_tensor * inpL;
@@ -7912,7 +8025,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
     }
 }

-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates,
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
     const int64_t t_start_sample_us = ggml_time_us();

     k = std::max(k, (int) min_keep);
@@ -8272,7 +8385,7 @@ void llama_sample_classifier_free_guidance(
     }
 }

-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta,
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);

     auto N = float(llama_n_vocab(llama_get_model(ctx)));
@@ -9480,7 +9593,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

-
+int32_t llama_max_devices(void) {
     return LLAMA_MAX_DEVICES;
 }

@@ -9622,8 +9735,8 @@ struct llama_context * llama_new_context_with_model(
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;

-    GGML_ASSERT(hparams.
-    GGML_ASSERT(hparams.
+    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

     // reserve memory for context buffers
     if (!hparams.vocab_only) {
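The context-creation check now validates the per-head K/V widths against the block size of the requested cache types, since a quantized cache can only store whole blocks per head row. A small sketch of the same divisibility rule; block sizes are shown for f16 (1) and the 32-wide Q4_0 blocks, and other types follow the same pattern:

    #include <cstdint>
    #include <cstdio>

    static bool cache_type_fits(uint32_t n_embd_head, uint32_t block_size) {
        return n_embd_head % block_size == 0; // one head row must hold whole blocks
    }

    int main() {
        printf("%d\n", cache_type_fits(128, 1));  // f16 cache, 128-wide heads -> ok
        printf("%d\n", cache_type_fits(128, 32)); // q4_0 cache, 128-wide heads -> ok
        printf("%d\n", cache_type_fits(80, 32));  // q4_0 cache, 80-wide heads  -> rejected
        return 0;
    }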
@@ -9791,15 +9904,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }

-
+int32_t llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }

-
+int32_t llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }

-
+int32_t llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

@@ -9807,7 +9920,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }

-
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
     if (buf_size > 0) {
@@ -9818,11 +9931,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key,
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-
+int32_t llama_model_meta_count(const struct llama_model * model) {
     return (int)model->gguf_kv.size();
 }

-
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
     if (buf_size > 0) {
     buf[0] = '\0';
@@ -9834,7 +9947,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char
     return snprintf(buf, buf_size, "%s", it->first.c_str());
 }

-
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
     if (buf_size > 0) {
     buf[0] = '\0';
@@ -9846,9 +9959,10 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-
-    return snprintf(buf, buf_size, "%s %s %s",
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s %s%s %s",
         llama_model_arch_name(model->arch).c_str(),
+        model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
         llama_model_type_name(model->type),
         llama_model_ftype_name(model->ftype).c_str());
 }
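llama_model_desc now injects an "<n_expert>x" prefix in front of the size label for mixture-of-experts models and leaves it empty otherwise. A sketch of the effect with made-up label strings; the real arch, type, and ftype names come from helpers inside llama.cpp:

    #include <cstdio>
    #include <string>

    static std::string model_desc_sketch(const std::string & arch, unsigned n_expert,
                                         const std::string & type, const std::string & ftype) {
        const std::string experts = n_expert > 0 ? std::to_string(n_expert) + "x" : "";
        return arch + " " + experts + type + " " + ftype; // mirrors the "%s %s%s %s" format
    }

    int main() {
        // dense model: no expert prefix
        printf("%s\n", model_desc_sketch("llama", 0, "7B", "mostly Q4_0").c_str());
        // 8-expert MoE model: size reads as "8x7B"
        printf("%s\n", model_desc_sketch("llama", 8, "7B", "mostly Q4_0").c_str());
        return 0;
    }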
@@ -9873,7 +9987,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
     return ggml_get_tensor(model->ctx, name);
 }

-
+uint32_t llama_model_quantize(
     const char * fname_inp,
     const char * fname_out,
     const llama_model_quantize_params * params) {
@@ -9886,7 +10000,7 @@ int llama_model_quantize(
     }
 }

-
+int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
     return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
@@ -9895,7 +10009,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }

-
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
     return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
@@ -9993,7 +10107,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
     }
 }

-
+int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     int result = 0;

     for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
@@ -10003,7 +10117,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return result;
 }

-
+int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
     return ctx->kv_self.used;
 }

@@ -10167,9 +10281,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;

-    const auto n_layer
-    const auto
-    const auto
+    const auto n_layer = hparams.n_layer;
+    const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const auto n_ctx = cparams.n_ctx;

     const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
     const uint32_t kv_head = kv_self.head;
@@ -10191,15 +10306,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     std::vector<struct ggml_tensor *> vout2d(n_layer);

     for (int il = 0; il < (int) n_layer; ++il) {
-        kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type,
-        vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head,
+        kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+        vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

         ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-
-            elt_size*
+            n_embd_k_gqa, kv_head,
+            elt_size*n_embd_k_gqa, 0);

         ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-            kv_head,
+            kv_head, n_embd_v_gqa,
             elt_size*n_ctx, 0);

         ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
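Because V is cached transposed (one n_ctx-long row per value dimension), the state writer copies K as an n_embd_k_gqa x kv_head block and V as a kv_head x n_embd_v_gqa block strided by n_ctx. A sketch of the element counts that end up in the session blob per layer; element counts only, while the surrounding code also records kv_buf_size and the cell bookkeeping:

    #include <cstddef>
    #include <cstdio>

    static size_t kv_state_elems_per_layer(size_t n_embd_k_gqa, size_t n_embd_v_gqa,
                                           size_t kv_head /* cells actually in use */) {
        const size_t k_elems = n_embd_k_gqa * kv_head; // kout2d: [n_embd_k_gqa, kv_head]
        const size_t v_elems = kv_head * n_embd_v_gqa; // vout2d: [kv_head, n_embd_v_gqa]
        return k_elems + v_elems;
    }

    int main() {
        // illustrative: 1024-wide K/V GQA rows with 300 cached cells
        printf("%zu elements\n", kv_state_elems_per_layer(1024, 1024, 300)); // 614400
        return 0;
    }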
@@ -10306,9 +10421,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;

-    const int n_layer
-    const int
-    const int
+    const int n_layer = hparams.n_layer;
+    const int n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int n_ctx = cparams.n_ctx;

     size_t kv_buf_size;
     uint32_t kv_head;
@@ -10332,15 +10448,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     std::vector<struct ggml_tensor *> vin2d(n_layer);

     for (int il = 0; il < n_layer; ++il) {
-        kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type,
-        vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head,
+        kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+        vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

         ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-
-            elt_size*
+            n_embd_k_gqa, kv_head,
+            elt_size*n_embd_k_gqa, 0);

         ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-            kv_head,
+            kv_head, n_embd_v_gqa,
             elt_size*n_ctx, 0);

         ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
@@ -10483,7 +10599,7 @@ int llama_eval(
     struct llama_context * ctx,
     llama_token * tokens,
     int32_t n_tokens,
-
+    int32_t n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

     const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
@@ -10498,7 +10614,7 @@ int llama_eval_embd(
     struct llama_context * ctx,
     float * embd,
     int32_t n_tokens,
-
+    int32_t n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

     llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
@@ -10569,7 +10685,7 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits) free(batch.logits);
 }

-
+int32_t llama_decode(
     struct llama_context * ctx,
     struct llama_batch batch) {
     const int ret = llama_decode_internal(*ctx, batch);
@@ -10617,11 +10733,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }

-
+int32_t llama_add_bos_token(const struct llama_model * model) {
     return model->vocab.special_add_bos;
 }

-
+int32_t llama_add_eos_token(const struct llama_model * model) {
     return model->vocab.special_add_eos;
 }

@@ -10641,12 +10757,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
     return model->vocab.special_eot_id;
 }

-
+int32_t llama_tokenize(
     const struct llama_model * model,
     const char * text,
-
+    int32_t text_len,
     llama_token * tokens,
-
+    int32_t n_max_tokens,
     bool add_bos,
     bool special) {
     auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
@@ -10674,7 +10790,7 @@ static std::string llama_decode_text(const std::string & text) {
 }

 // does not write null-terminator to buf
-
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
     if (0 <= token && token < llama_n_vocab(model)) {
     switch (llama_vocab_get_type(model->vocab)) {
     case LLAMA_VOCAB_TYPE_SPM: {
@@ -10775,6 +10891,7 @@ const char * llama_print_system_info(void) {

     s = "";
     s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
     s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";