llama_cpp 0.10.3 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/LICENSE.txt +1 -1
- data/ext/llama_cpp/extconf.rb +35 -110
- data/ext/llama_cpp/llama_cpp.cpp +52 -28
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -1
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/Makefile +758 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c +6 -2
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu +73 -63
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h +1 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m +43 -20
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.metal +464 -245
- data/vendor/tmp/llama.cpp/ggml-opencl.h +25 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.c +61 -57
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.c +171 -5
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.h +1 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp +222 -105
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h +31 -32
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +38 -0
- metadata +30 -27
- data/ext/llama_cpp/src/ggml-opencl.h +0 -25
- data/ext/llama_cpp/src/llama-util.h +0 -546
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/LICENSE +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.c +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend-impl.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.c +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-opencl.cpp +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/unicode.h +0 -0
data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp

```diff
@@ -245,6 +245,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
     LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
     LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_KEY_LENGTH,
+    LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
@@ -297,6 +299,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,     "%s.attention.head_count_kv" },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,    "%s.attention.max_alibi_bias" },
     { LLM_KV_ATTENTION_CLAMP_KQV,         "%s.attention.clamp_kqv" },
+    { LLM_KV_ATTENTION_KEY_LENGTH,        "%s.attention.key_length" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH,      "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS,     "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
```
```diff
@@ -1279,6 +1283,8 @@ struct llama_hparams {
     uint32_t n_head_kv;
     uint32_t n_layer;
     uint32_t n_rot;
+    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
@@ -1305,6 +1311,8 @@ struct llama_hparams {
         if (this->n_head_kv != other.n_head_kv) return true;
         if (this->n_layer   != other.n_layer)   return true;
         if (this->n_rot     != other.n_rot)     return true;
+        if (this->n_embd_head_k != other.n_embd_head_k) return true;
+        if (this->n_embd_head_v != other.n_embd_head_v) return true;
         if (this->n_ff          != other.n_ff)          return true;
         if (this->n_expert      != other.n_expert)      return true;
         if (this->n_expert_used != other.n_expert_used) return true;
@@ -1326,12 +1334,12 @@ struct llama_hparams {
         return n_head/n_head_kv;
     }

-    uint32_t
-        return
+    uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+        return n_embd_head_k * n_head_kv;
     }

-    uint32_t
-        return
+    uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+        return n_embd_head_v * n_head_kv;
     }
 };

```
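The new `n_embd_head_k`/`n_embd_head_v` fields decouple the per-head key and value widths from `n_embd / n_head`, and the `n_embd_k_gqa()`/`n_embd_v_gqa()` helpers give the total K and V width across the `n_head_kv` grouped-query heads. A minimal standalone sketch of that arithmetic, with illustrative hyperparameter values that are not taken from this diff:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // illustrative GQA-style hyperparameters (assumed values, not from the diff)
    const uint32_t n_embd    = 4096;
    const uint32_t n_head    = 32;
    const uint32_t n_head_kv = 8;

    // default used when the GGUF keys are absent: head width = n_embd / n_head
    const uint32_t n_embd_head_k = n_embd / n_head;          // 128
    const uint32_t n_embd_head_v = n_embd / n_head;          // 128

    // totals across all k-v heads, mirroring n_embd_k_gqa() / n_embd_v_gqa()
    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // 1024
    const uint32_t n_embd_v_gqa = n_embd_head_v * n_head_kv; // 1024

    printf("n_gqa = %u, n_embd_k_gqa = %u, n_embd_v_gqa = %u\n",
           n_head / n_head_kv, n_embd_k_gqa, n_embd_v_gqa);
    return 0;
}
```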
```diff
@@ -1640,8 +1648,9 @@ static bool llama_kv_cache_init(
                  uint32_t   n_ctx,
                       int   n_gpu_layers,
                      bool   offload) {
-    const uint32_t
-    const uint32_t
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_layer      = hparams.n_layer;

     cache.has_shift = false;

@@ -1672,8 +1681,8 @@ static bool llama_kv_cache_init(
     const int i_gpu_start = (int) n_layer - n_gpu_layers;

     for (int i = 0; i < (int) n_layer; i++) {
-        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype,
-        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype,
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -2667,6 +2676,12 @@ static void llm_load_hparams(
         // gpt-j n_rot = rotary_dim
     }

+    hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+    hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
```
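The loader above reads `%s.attention.key_length` and `%s.attention.value_length` from the GGUF metadata, falling back to `n_embd / n_head`, and the KV cache then allocates `n_embd_k_gqa*n_ctx` key elements and `n_embd_v_gqa*n_ctx` value elements per layer. A back-of-the-envelope sketch of what that allocation costs, assuming an F16 cache type and illustrative dimensions (this mirrors the per-layer allocation shown above but is not code from the gem):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // assumed, illustrative values
    const uint64_t n_ctx         = 4096;
    const uint64_t n_layer       = 32;
    const uint64_t n_embd_k_gqa  = 1024;   // n_embd_head_k * n_head_kv
    const uint64_t n_embd_v_gqa  = 1024;   // n_embd_head_v * n_head_kv
    const uint64_t bytes_per_val = 2;      // e.g. an F16 cache type

    // one K tensor and one V tensor per layer, each n_ctx positions wide
    const uint64_t bytes = n_layer * n_ctx * (n_embd_k_gqa + n_embd_v_gqa) * bytes_per_val;
    printf("approx KV cache: %.2f MiB\n", bytes / (1024.0 * 1024.0));
    return 0;
}
```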
```diff
@@ -3077,8 +3092,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_head        = %u\n", __func__, hparams.n_head);
     LLAMA_LOG_INFO("%s: n_head_kv     = %u\n", __func__, hparams.n_head_kv);
     LLAMA_LOG_INFO("%s: n_layer       = %u\n", __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot         = %u\n", __func__, hparams.n_rot);
+    LLAMA_LOG_INFO("%s: n_rot         = %u\n", __func__, hparams.n_rot);
+    LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+    LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
     LLAMA_LOG_INFO("%s: n_gqa         = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: n_embd_k_gqa  = %u\n", __func__, hparams.n_embd_k_gqa());
+    LLAMA_LOG_INFO("%s: n_embd_v_gqa  = %u\n", __func__, hparams.n_embd_v_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps    = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
     LLAMA_LOG_INFO("%s: f_clamp_kqv   = %.1e\n", __func__, hparams.f_clamp_kqv);
@@ -3168,10 +3187,11 @@ static bool llm_load_tensors(

     // create tensors for the weights
     {
-        const int64_t n_embd
-        const int64_t
-        const int64_t
-        const int64_t
+        const int64_t n_embd       = hparams.n_embd;
+        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+        const int64_t n_layer      = hparams.n_layer;
+        const int64_t n_vocab      = hparams.n_vocab;

         const auto tn = LLM_TN(model.arch);
         switch (model.arch) {
@@ -3197,7 +3217,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3265,7 +3288,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3313,7 +3339,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3363,7 +3392,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3415,7 +3447,11 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
                 const int i_gpu_start = n_layer - n_gpu_layers;
                 model.layers.resize(n_layer);
                 for (uint32_t i = 0; i < n_layer; ++i) {
@@ -3464,7 +3500,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3515,7 +3554,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3562,7 +3604,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3660,7 +3705,10 @@ static bool llm_load_tensors(
                     model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3709,7 +3757,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3756,7 +3807,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }

-                const uint32_t n_ff
+                const uint32_t n_ff = hparams.n_ff;
+                const int64_t n_embd_gqa = n_embd_v_gqa;
+                GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+                GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                 const int i_gpu_start = n_layer - n_gpu_layers;

```
```diff
@@ -3995,8 +4049,8 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }

-// Persimmon: n_rot =
-// Other: n_rot =
+// Persimmon: n_rot = n_embd_head_k/2
+// Other: n_rot = n_embd_head_k
 static void llm_build_k_shift(
       struct ggml_context * ctx,
       const llama_hparams & hparams,
@@ -4009,17 +4063,17 @@ static void llm_build_k_shift(
                     float   freq_base,
                     float   freq_scale,
        const llm_build_cb & cb) {
-    const int64_t n_layer
-    const int64_t n_head_kv
-    const int64_t
-    const int64_t
-    const int32_t n_orig_ctx
-    const float ext_factor
-    const float attn_factor
-    const float beta_fast
-    const float beta_slow
-
-    GGML_ASSERT(
+    const int64_t n_layer       = hparams.n_layer;
+    const int64_t n_head_kv     = hparams.n_head_kv;
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+    const int32_t n_orig_ctx    = cparams.n_yarn_orig_ctx;
+    const float   ext_factor    = cparams.yarn_ext_factor;
+    const float   attn_factor   = cparams.yarn_attn_factor;
+    const float   beta_fast     = cparams.yarn_beta_fast;
+    const float   beta_slow     = cparams.yarn_beta_slow;
+
+    GGML_ASSERT(n_embd_head_k % n_rot == 0);

     struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
     cb(K_shift, "K_shift", -1);
@@ -4037,9 +4091,9 @@ static void llm_build_k_shift(
         // we rotate only the first n_rot dimensions
         ggml_rope_custom_inplace(ctx,
                 ggml_view_3d(ctx, kv.k_l[il],
-
-                    ggml_row_size(kv.k_l[il]->type,
-                    ggml_row_size(kv.k_l[il]->type,
+                    n_embd_head_k, n_head_kv, n_ctx,
+                    ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
                     0),
                 K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
@@ -4060,18 +4114,19 @@ static void llm_build_kv_store(
                   int32_t   kv_head,
        const llm_build_cb & cb,
                   int64_t   il) {
-    const int64_t
+    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

     // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur,
+    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);

-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*
-            (ggml_row_size(kv.k_l[il]->type,
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+            (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);

-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens,
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
             (   n_ctx)*ggml_element_size(kv.v_l[il]),
             (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);
```
```diff
@@ -4221,20 +4276,20 @@ static struct ggml_tensor * llm_build_kqv(
                     float     kq_scale,
          const llm_build_cb & cb,
                     int       il) {
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
+    const int64_t n_head        = hparams.n_head;
+    const int64_t n_head_kv     = hparams.n_head_kv;
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;

     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);

     struct ggml_tensor * k =
         ggml_view_3d(ctx, kv.k_l[il],
-
-                ggml_row_size(kv.k_l[il]->type,
-                ggml_row_size(kv.k_l[il]->type,
+                n_embd_head_k, n_kv, n_head_kv,
+                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
                 0);
     cb(k, "k", il);

@@ -4273,9 +4328,9 @@ static struct ggml_tensor * llm_build_kqv(
         // split cached v into n_head heads
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
-                    n_kv,
+                    n_kv, n_embd_head_v, n_head_kv,
                     ggml_element_size(kv.v_l[il])*n_ctx,
-                    ggml_element_size(kv.v_l[il])*n_ctx*
+                    ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
                     0);
         cb(v, "v", il);

@@ -4285,7 +4340,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
     cb(kqv_merged, "kqv_merged", il);

-    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged,
+    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
     cb(cur, "kqv_merged_cont", il);

     cur = ggml_mul_mat(ctx, wo, cur);
```
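In the updated K view above, the per-layer K cache is one flat buffer with an `n_embd_k_gqa`-wide row per cached position, and each kv head occupies an `n_embd_head_k`-wide slice of that row. A small sketch of the offset arithmetic behind those two strides, assuming an F16 cache and illustrative dimensions (this is not code from the diff):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_head_k = 128;            // assumed, illustrative
    const int64_t n_head_kv     = 8;
    const int64_t n_embd_k_gqa  = n_embd_head_k * n_head_kv;
    const size_t  sz            = 2;              // bytes per F16 element

    // nb1 / nb2 of the 3D view [n_embd_head_k, n_kv, n_head_kv]:
    const size_t row_bytes  = (size_t) n_embd_k_gqa  * sz; // stride between cache positions
    const size_t head_bytes = (size_t) n_embd_head_k * sz; // stride between kv heads

    // element (d, t, h) = (dim within head, cache position, kv head)
    const size_t off = 3*sz + 17*row_bytes + 2*head_bytes;  // e.g. d=3, t=17, h=2
    printf("row stride = %zu B, head stride = %zu B, sample offset = %zu B\n",
           row_bytes, head_bytes, off);
    return 0;
}
```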
```diff
@@ -4312,8 +4367,10 @@ struct llm_build_context {
     const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
     const int64_t n_head;
     const int64_t n_head_kv;
-    const int64_t
-    const int64_t
+    const int64_t n_embd_head_k;
+    const int64_t n_embd_k_gqa;
+    const int64_t n_embd_head_v;
+    const int64_t n_embd_v_gqa;
     const int64_t n_expert;
     const int64_t n_expert_used;

@@ -4355,8 +4412,10 @@ struct llm_build_context {
         n_ctx           (cparams.n_ctx),
         n_head          (hparams.n_head),
         n_head_kv       (hparams.n_head_kv),
-
-
+        n_embd_head_k   (hparams.n_embd_head_k),
+        n_embd_k_gqa    (hparams.n_embd_k_gqa()),
+        n_embd_head_v   (hparams.n_embd_head_v),
+        n_embd_v_gqa    (hparams.n_embd_v_gqa()),
         n_expert        (hparams.n_expert),
         n_expert_used   (hparams.n_expert_used),
         freq_base       (cparams.rope_freq_base),
@@ -4399,6 +4458,8 @@ struct llm_build_context {
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

         struct ggml_tensor * cur;
@@ -4583,6 +4644,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_baichuan() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4700,6 +4764,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_falcon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4819,6 +4888,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_starcoder() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
@@ -4915,7 +4989,12 @@ struct llm_build_context {
     struct ggml_cgraph * build_persimmon() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

-        const int64_t
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
+        const int64_t n_rot = n_embd_head_k / 2;

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5124,6 +5203,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_refact() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -5212,6 +5296,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_bloom() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -5303,6 +5392,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_mpt() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -5398,6 +5492,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_stablelm() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -5508,6 +5605,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_qwen() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -5619,6 +5719,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_phi2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
         struct ggml_tensor * ffn_output;
@@ -5731,6 +5836,9 @@ struct llm_build_context {
     struct ggml_cgraph * build_plamo() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -5835,6 +5943,11 @@ struct llm_build_context {
     struct ggml_cgraph * build_gpt2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_gqa == n_embd);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
```
```diff
@@ -7912,7 +8025,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
     }
 }

-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates,
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
     const int64_t t_start_sample_us = ggml_time_us();

     k = std::max(k, (int) min_keep);
@@ -8272,7 +8385,7 @@ void llama_sample_classifier_free_guidance(
     }
 }

-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta,
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);

     auto N = float(llama_n_vocab(llama_get_model(ctx)));
```
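The sampler entry points now take `int32_t` counts instead of `int`. A hypothetical caller-side sketch of the updated `llama_sample_top_k` signature; the helper below and its candidate-array setup are illustrative, not code from the gem:

```cpp
#include <vector>
#include "llama.h"

// Build a candidate array from raw logits and keep the 40 most likely tokens.
void pick_top_k(llama_context * ctx, const float * logits, int n_vocab) {
    std::vector<llama_token_data> cand;
    cand.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand.push_back({ id, logits[id], 0.0f });   // { id, logit, p }
    }
    llama_token_data_array cand_arr = { cand.data(), cand.size(), false };

    llama_sample_top_k(ctx, &cand_arr, /*k =*/ 40, /*min_keep =*/ 1); // k is now int32_t
    llama_sample_softmax(ctx, &cand_arr);
    // cand_arr.data[0 .. cand_arr.size) now holds the renormalized top-40 candidates
}
```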
```diff
@@ -9480,7 +9593,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

-
+int32_t llama_max_devices(void) {
     return LLAMA_MAX_DEVICES;
 }

```
```diff
@@ -9622,8 +9735,8 @@ struct llama_context * llama_new_context_with_model(
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;

-    GGML_ASSERT(hparams.
-    GGML_ASSERT(hparams.
+    GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

     // reserve memory for context buffers
     if (!hparams.vocab_only) {
@@ -9791,15 +9904,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }

-
+int32_t llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }

-
+int32_t llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }

-
+int32_t llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

@@ -9807,7 +9920,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }

-
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
         if (buf_size > 0) {
@@ -9818,11 +9931,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key,
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-
+int32_t llama_model_meta_count(const struct llama_model * model) {
     return (int)model->gguf_kv.size();
 }

-
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
             buf[0] = '\0';
@@ -9834,7 +9947,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char
     return snprintf(buf, buf_size, "%s", it->first.c_str());
 }

-
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
             buf[0] = '\0';
@@ -9846,9 +9959,10 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-
-    return snprintf(buf, buf_size, "%s %s %s",
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s %s%s %s",
             llama_model_arch_name(model->arch).c_str(),
+            model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
```
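The metadata accessors above now return `int32_t`. A hypothetical sketch of walking the GGUF key/value metadata through them; buffer sizes and the `dump_meta` helper are illustrative, not part of the gem:

```cpp
#include <cstdio>
#include "llama.h"

void dump_meta(const llama_model * model) {
    char key[256];
    char val[256];

    const int32_t n = llama_model_meta_count(model);
    for (int32_t i = 0; i < n; ++i) {
        // both accessors write into the buffer and return the snprintf-style length
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));  // e.g. arch, expert count, type, ftype
    printf("model: %s\n", desc);
}
```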
```diff
@@ -9873,7 +9987,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
     return ggml_get_tensor(model->ctx, name);
 }

-
+uint32_t llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
         const llama_model_quantize_params * params) {
@@ -9886,7 +10000,7 @@ int llama_model_quantize(
     }
 }

-
+int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
@@ -9895,7 +10009,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }

-
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
@@ -9993,7 +10107,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
     }
 }

-
+int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     int result = 0;

     for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
@@ -10003,7 +10117,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return result;
 }

-
+int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
     return ctx->kv_self.used;
 }

@@ -10167,9 +10281,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;

-    const auto n_layer
-    const auto
-    const auto
+    const auto n_layer      = hparams.n_layer;
+    const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const auto n_ctx        = cparams.n_ctx;

     const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
     const uint32_t kv_head = kv_self.head;
@@ -10191,15 +10306,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         std::vector<struct ggml_tensor *> vout2d(n_layer);

         for (int il = 0; il < (int) n_layer; ++il) {
-            kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type,
-            vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head,
+            kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+            vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

             ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-
-                    elt_size*
+                    n_embd_k_gqa, kv_head,
+                    elt_size*n_embd_k_gqa, 0);

             ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-                    kv_head,
+                    kv_head, n_embd_v_gqa,
                     elt_size*n_ctx, 0);

             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
```
```diff
@@ -10306,9 +10421,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;

-    const int n_layer
-    const int
-    const int
+    const int n_layer      = hparams.n_layer;
+    const int n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const int n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const int n_ctx        = cparams.n_ctx;

     size_t   kv_buf_size;
     uint32_t kv_head;
@@ -10332,15 +10448,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         std::vector<struct ggml_tensor *> vin2d(n_layer);

         for (int il = 0; il < n_layer; ++il) {
-            kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type,
-            vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head,
+            kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+            vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

             ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-
-                    elt_size*
+                    n_embd_k_gqa, kv_head,
+                    elt_size*n_embd_k_gqa, 0);

             ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-                    kv_head,
+                    kv_head, n_embd_v_gqa,
                     elt_size*n_ctx, 0);

             ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
@@ -10483,7 +10599,7 @@ int llama_eval(
         struct llama_context * ctx,
                  llama_token * tokens,
                      int32_t   n_tokens,
-
+                     int32_t   n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

     const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
@@ -10498,7 +10614,7 @@ int llama_eval_embd(
         struct llama_context * ctx,
                        float * embd,
                      int32_t   n_tokens,
-
+                     int32_t   n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

     llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
@@ -10569,7 +10685,7 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.logits) free(batch.logits);
 }

-
+int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
     const int ret = llama_decode_internal(*ctx, batch);
@@ -10617,11 +10733,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }

-
+int32_t llama_add_bos_token(const struct llama_model * model) {
     return model->vocab.special_add_bos;
 }

-
+int32_t llama_add_eos_token(const struct llama_model * model) {
     return model->vocab.special_add_eos;
 }

@@ -10641,12 +10757,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
     return model->vocab.special_eot_id;
 }

-
+int32_t llama_tokenize(
     const struct llama_model * model,
                   const char * text,
-
+                     int32_t   text_len,
                  llama_token * tokens,
-
+                     int32_t   n_max_tokens,
                         bool   add_bos,
                         bool   special) {
     auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
```
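A hypothetical usage sketch of the `int32_t`-based `llama_tokenize` signature; the "retry with a larger buffer when the return value is negative" convention is the common caller pattern for this API and is assumed here, not taken from the gem:

```cpp
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool add_bos) {
    // initial guess: at most one token per byte, plus optional BOS
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0) + 1);

    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(), add_bos, /*special =*/ false);
    if (n < 0) {
        // buffer was too small; -n is assumed to be the required token count
        tokens.resize(-n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(), add_bos, false);
    }
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}
```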
```diff
@@ -10674,7 +10790,7 @@ static std::string llama_decode_text(const std::string & text) {
 }

 // does not write null-terminator to buf
-
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_SPM: {
@@ -10775,6 +10891,7 @@ const char * llama_print_system_info(void) {

     s  = "";
     s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
     s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
```
|