llama_cpp 0.14.7 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
@@ -75,6 +75,7 @@
 #include <forward_list>
 #include <fstream>
 #include <functional>
+#include <future>
 #include <initializer_list>
 #include <locale>
 #include <map>

@@ -107,7 +108,6 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 60
 
-
 //
 // logging
 //

@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,

@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
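
Note: the two hunks above introduce a new GGUF metadata key, "tokenizer.ggml.pre", which records the pre-tokenizer a model was converted with. A minimal sketch (not part of the diff) of inspecting the key through the public gguf API, assuming a gguf_context loaded elsewhere:

    // sketch: query the new pre-tokenizer key from loaded GGUF metadata
    #include "ggml.h"
    #include <cstdio>

    static void print_pre_tokenizer(const struct gguf_context * ctx) {
        const int keyidx = gguf_find_key(ctx, "tokenizer.ggml.pre");
        if (keyidx == -1) {
            // older models lack the key; the loader falls back to 'default'
            std::printf("pre-tokenizer: (missing)\n");
            return;
        }
        std::printf("pre-tokenizer: %s\n", gguf_get_val_str(ctx, keyidx));
    }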
@@ -1843,7 +1845,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool
+    bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;

@@ -1933,6 +1935,7 @@ struct llama_cparams {
     bool embeddings;
     bool causal_attn;
     bool offload_kqv;
+    bool flash_attn;
 
     enum llama_pooling_type pooling_type;
 

@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
     bool do_copy = false;
-    // with recurrent state models, a cell can hold the state for more than one past token
-    bool
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans = true;    // the value tensor is transposed
 
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it

@@ -2114,7 +2117,8 @@ struct llama_vocab {
         ttype type;
     };
 
-    enum llama_vocab_type
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;

@@ -2335,11 +2339,14 @@ struct llama_context {
 
 static bool llama_kv_cache_init(
         struct llama_kv_cache & cache,
-
+        const llama_context * ctx,
         ggml_type type_k,
         ggml_type type_v,
         uint32_t kv_size,
         bool offload) {
+    const llama_model & model = ctx->model;
+    const llama_cparams & cparams = ctx->cparams;
+
     const struct llama_hparams & hparams = model.hparams;
 
     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();

@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
 
     // TODO: find a nicer way to add other recurrent model architectures
     cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+    cache.v_trans   = !cparams.flash_attn;
 
-    // TODO: support mixed
+    // TODO: support mixed recurrent Transformer architectures
     // NOTE: (!a || b) is a logical implication (a -> b)
     GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
     GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());

@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     }
     cache.head = 0;
     cache.used = 0;
+
+    for (auto & buf : cache.bufs) {
+        ggml_backend_buffer_clear(buf, 0);
+    }
 }
 
 static bool llama_kv_cache_seq_rm(

@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
             case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
         }
         return "unknown";
     }

@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
                 __func__, override_type_to_str(ovrd->tag), ovrd->key);
         switch (ovrd->tag) {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
-                LLAMA_LOG_INFO("%s\n", ovrd->
+                LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
             } break;
             case LLAMA_KV_OVERRIDE_TYPE_INT: {
-                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->
+                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
             } break;
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                LLAMA_LOG_INFO("%.6f\n", ovrd->
+                LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+            } break;
+            case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                LLAMA_LOG_INFO("%s\n", ovrd->val_str);
             } break;
             default:
                 // Shouldn't be possible to end up here, but just in case...

@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
     static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_bool;
             return true;
         }
         return false;

@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
     static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_i64;
             return true;
         }
         return false;

@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
     static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_f64;
             return true;
         }
         return false;

@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
     template<typename OT>
     static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override * ovrd) {
-        (
-
-
-
-
-            ovrd ? ovrd->key : "NULL"));
+        if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+            target = ovrd->val_str;
+            return true;
+        }
+        return false;
     }
 
     static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
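
Note: the override hunks above add a string variant (LLAMA_KV_OVERRIDE_TYPE_STR) and switch the value members to val_bool/val_i64/val_f64/val_str. A hedged usage sketch, assuming the llama.h struct layout that ships with this version (fixed-size key and val_str buffers):

    // sketch: build a string KV override that forces the pre-tokenizer type;
    // the loader expects an array of overrides terminated by an entry whose
    // key is empty
    #include "llama.h"
    #include <cstring>

    static llama_model_kv_override make_pre_override() {
        llama_model_kv_override ovrd = {};
        ovrd.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        std::strncpy(ovrd.key, "tokenizer.ggml.pre", sizeof(ovrd.key) - 1);
        std::strncpy(ovrd.val_str, "llama3", sizeof(ovrd.val_str) - 1);
        return ovrd;
    }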
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool check_tensors;
 
     llama_files files;
     llama_ftype ftype;

@@ -3018,7 +3034,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
        int trace = 0;
        if (getenv("LLAMA_TRACE")) {
            trace = atoi(getenv("LLAMA_TRACE"));

@@ -3115,9 +3131,17 @@ struct llama_model_loader {
 
        fver = (enum llama_fver) gguf_get_version(meta);
 
+       std::set<std::string> tensor_names;
        for (auto & w : weights) {
            n_elements += ggml_nelements(w.tensor);
            n_bytes    += ggml_nbytes(w.tensor);
+           // make sure there is no duplicated tensor names
+           const std::string name(w.tensor->name);
+           auto found = tensor_names.find(name);
+           if (found != tensor_names.end()) {
+               throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+           }
+           tensor_names.insert(name);
        }
 
        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",

@@ -3223,6 +3247,7 @@ struct llama_model_loader {
        }
 
        this->use_mmap = use_mmap;
+       this->check_tensors = check_tensors;
     }
 
     ~llama_model_loader() {

@@ -3481,6 +3506,10 @@ struct llama_model_loader {
            file->seek(w.offs, SEEK_SET);
            file->read_raw(cur->data, ggml_nbytes(cur));
        }
+
+       if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+           throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+       }
     }
 
     size_t size_done = 0;

@@ -3497,6 +3526,8 @@ struct llama_model_loader {
        GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
        std::vector<no_init<uint8_t>> read_buf;
+       std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            const auto * weight = get_weight(ggml_get_name(cur));
            if (weight == nullptr) {

@@ -3518,37 +3549,66 @@ struct llama_model_loader {
                if (bufs_mmap.count(weight->idx)) {
                    buf_mmap = bufs_mmap.at(weight->idx);
                }
+               uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+               if (check_tensors) {
+                   validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                       return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                   }));
+               }
+
                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                if (buf_mmap && cur->data == nullptr) {
-                   ggml_backend_tensor_alloc(buf_mmap, cur,
+                   ggml_backend_tensor_alloc(buf_mmap, cur, data);
                    if (lmlocks) {
                        const auto & lmlock = lmlocks->at(weight->idx);
-                       lmlock->grow_to(weight->offs +
+                       lmlock->grow_to(weight->offs + n_size);
                    }
 
                    auto & mmap_used = mmaps_used[weight->idx];
                    mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                } else {
-                   ggml_backend_tensor_set(cur,
+                   ggml_backend_tensor_set(cur, data, 0, n_size);
                }
            } else {
                GGML_ASSERT(weight->idx < files.size());
                const auto & file = files.at(weight->idx);
                if (ggml_backend_buffer_is_host(cur->buffer)) {
                    file->seek(weight->offs, SEEK_SET);
-                   file->read_raw(cur->data,
+                   file->read_raw(cur->data, n_size);
+                   if (check_tensors) {
+                       validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                           return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                       }));
+                   }
                } else {
-                   read_buf.resize(
+                   read_buf.resize(n_size);
                    file->seek(weight->offs, SEEK_SET);
-                   file->read_raw(read_buf.data(),
+                   file->read_raw(read_buf.data(), n_size);
                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                   if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                       throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                   }
                }
            }
 
            size_done += n_size;
        }
 
+       // check validation results
+       bool validation_failed = false;
+       for (auto & future : validation_result) {
+           auto result = future.get();
+           if (!result.second) {
+               LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+               validation_failed = true;
+           }
+       }
+       if (validation_failed) {
+           throw std::runtime_error("found tensors with invalid data");
+       }
+
        // check if this is the last call and do final cleanup
        if (size_done >= size_data) {
            // unmap offloaded tensors and metadata
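
Note: the loading path above overlaps tensor validation with I/O by launching one std::async task per tensor and joining all futures once reading finishes. A self-contained sketch of the same fan-out/fan-in pattern (stand-in data; the real code calls ggml_validate_row_data on the tensor bytes):

    #include <cstdio>
    #include <future>
    #include <utility>
    #include <vector>

    static bool validate_all(const std::vector<std::pair<const char *, bool>> & tensors) {
        std::vector<std::future<std::pair<const char *, bool>>> results;
        for (const auto & t : tensors) {
            // fan out: one background check per tensor
            results.emplace_back(std::async(std::launch::async, [t] {
                return std::make_pair(t.first, t.second); // t.second stands in for the real check
            }));
        }
        bool ok = true;
        for (auto & f : results) {
            // fan in: report every failure instead of stopping at the first one
            const auto r = f.get();
            if (!r.second) {
                std::fprintf(stderr, "tensor '%s' has invalid data\n", r.first);
                ok = false;
            }
        }
        return ok;
    }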
@@ -4142,7 +4202,7 @@ static void llm_load_hparams(
     model.ftype = ml.ftype;
 
     if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.
+        hparams.use_alibi = true;
     }
 
     hparams.rope_type = llama_rope_type(&model);

@@ -4165,11 +4225,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL,
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
 
-        if (
+        if (tokenizer_model == "no_vocab") {
            vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
            // default special tokens

@@ -4183,7 +4245,7 @@ static void llm_load_vocab(
            vocab.linefeed_id = -1;
 
            return;
-        } else if (
+        } else if (tokenizer_model == "llama") {
            vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
            // default special tokens

@@ -4228,9 +4290,27 @@ static void llm_load_vocab(
            if (add_space_prefix_keyidx != -1) {
                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
            } // The default value of add_space_prefix is true.
-        } else if (
-           vocab.type =
+        } else if (tokenizer_model == "bert") {
+           vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
+           // default special tokens
+           vocab.special_bos_id  = -1;
+           vocab.special_eos_id  = -1;
+           vocab.special_unk_id  = 100;
+           vocab.special_sep_id  = 102;
+           vocab.special_pad_id  = 0;
+           vocab.special_cls_id  = 101;
+           vocab.special_mask_id = 103;
+           vocab.add_space_prefix = false;
+        } else {
+           if (tokenizer_model == "gpt2") {
+               vocab.type = LLAMA_VOCAB_TYPE_BPE;
+           } else {
+               LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+               LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+               vocab.type = LLAMA_VOCAB_TYPE_SPM;
+               return;
+           }
            // read bpe merges and populate bpe ranks
            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
            if (merges_keyidx == -1) {

@@ -4264,23 +4344,50 @@ static void llm_load_vocab(
            vocab.special_pad_id  = -1;
            vocab.special_cls_id  = -1;
            vocab.special_mask_id = -1;
-        }
-        vocab.type = LLAMA_VOCAB_TYPE_WPM;
+        }
 
-
-
-
-
-
-
-
-
-
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
        } else {
-
-           LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-           vocab.type = LLAMA_VOCAB_TYPE_SPM;
+           vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
     }
 

@@ -5975,7 +6082,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
        model.hparams.vocab_only = params.vocab_only;
 

@@ -6104,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_kv_store(
        struct ggml_context * ctx,
        const llama_hparams & hparams,
+       const llama_cparams & cparams,
        const llama_kv_cache & kv,
        struct ggml_cgraph * graph,
        struct ggml_tensor * k_cur,
        struct ggml_tensor * v_cur,
-       int64_t n_ctx,
        int32_t n_tokens,
        int32_t kv_head,
        const llm_build_cb & cb,
        int64_t il) {
+    const int64_t n_ctx = cparams.n_ctx;
+
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     GGML_ASSERT(kv.size == n_ctx);
 
-    // compute the transposed [n_tokens, n_embd] V matrix
-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
-    cb(v_cur_t, "v_cur_t", il);
-
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
            (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-
-
-
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    if (cparams.flash_attn) {
+        v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+                (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+                (  n_ctx)*ggml_element_size(kv.v_l[il]),
+                (kv_head)*ggml_element_size(kv.v_l[il]));
+
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
     cb(v_cache_view, "v_cache_view", il);
 
-
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
 }
 
 static struct ggml_tensor * llm_build_norm(
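
Note: with flash attention enabled the V cache stores one contiguous row per token, so llm_build_kv_store can append with a 1-D view; without it the cache holds V transposed and each channel becomes a row of length n_ctx. A small illustration (not from the diff) of the two byte-offset computations for element (token t, channel c):

    #include <cstddef>

    // flash attention: row-per-token layout, appending a token writes one contiguous row
    static size_t v_offs_flat(size_t t, size_t c, size_t n_embd_v_gqa, size_t elt) {
        return (t*n_embd_v_gqa + c)*elt;
    }

    // default: transposed layout, appending a token touches one element in every channel row
    static size_t v_offs_trans(size_t t, size_t c, size_t n_ctx, size_t elt) {
        return (c*n_ctx + t)*elt;
    }

The transposed layout keeps each channel contiguous for the later V*softmax(KQ) matrix multiply, while the flat layout is what the fused flash-attention kernel expects.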
@@ -6354,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
     return moe_out;
 }
 
-// if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
        struct ggml_context * ctx,
        const llama_model & model,
        const llama_hparams & hparams,
+       const llama_cparams & cparams,
        const llama_kv_cache & kv,
        struct ggml_cgraph * graph,
        struct ggml_tensor * wo,

@@ -6366,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
        struct ggml_tensor * q_cur,
        struct ggml_tensor * kq_mask,
        struct ggml_tensor * kq_pos,
-       int64_t n_ctx,
        int32_t n_tokens,
        int32_t n_kv,
        float kq_scale,
        const llm_build_cb & cb,
        int il) {
+    const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head_k = hparams.n_embd_head_k;

@@ -6389,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
            0);
     cb(k, "k", il);
 
-    struct ggml_tensor *
-    cb(kq, "kq", il);
+    struct ggml_tensor * cur;
 
-    if (
-
-
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-    }
+    if (cparams.flash_attn) {
+        GGML_UNUSED(model);
+        GGML_UNUSED(n_ctx);
 
-
-    //
-
-    // and then :
-    // kq = 30 * tanh(kq / 30)
-    // before the softmax below
+        // note: if this assert triggers, then some check has failed earlier
+        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
+        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
 
-    //
-
+        // split cached v into n_head heads (not transposed)
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_embd_head_v, n_kv, n_head_kv,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    0);
+        cb(v, "v", il);
 
-
-
-
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+        }
+
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+    } else {
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+        cb(kq, "kq", il);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+        }
+
+        if (model.arch == LLM_ARCH_GROK) {
+            // need to do the following:
+            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // and then :
+            // kq = 30 * tanh(kq / 30)
+            // before the softmax below
+
+            //try from phi2
+            //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+            kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+            kq = ggml_scale(ctx, kq, 30);
+        }
 
 #if defined(GGML_USE_KOMPUTE)
 #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-
-
-
+        if (hparams.use_alibi) {
+            kq = ggml_scale(ctx, kq, kq_scale);
+            cb(kq, "kq_scaled", il);
 
-
-
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
 
-
-
+            kq = ggml_add(ctx, kq, kq_mask);
+            cb(kq, "kq_masked", il);
 
-
-
-
+            kq = ggml_soft_max(ctx, kq);
+            cb(kq, "kq_soft_max", il);
+        } else
 #endif
-
-
-
-
+        {
+            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+        }
 
-
+        GGML_ASSERT(kv.size == n_ctx);
 
-
-
-
-
-
-
-
-
+        // split cached v into n_head heads
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_kv, n_embd_head_v, n_head_kv,
+                    ggml_element_size(kv.v_l[il])*n_ctx,
+                    ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+                    0);
+        cb(v, "v", il);
 
-
-
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+        cb(kqv, "kqv", il);
 
-
-
+        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+        cb(kqv_merged, "kqv_merged", il);
 
-
-
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cb(cur, "kqv_merged_cont", il);
+    }
 
     ggml_build_forward_expand(graph, cur);
 
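
Note: the hunk above splits llm_build_kqv into a fused path (ggml_flash_attn_ext) and the classic KQ -> softmax -> KQV chain. As a reference for what the fused kernel computes per query, here is plain, unoptimized single-query attention in scalar C++ (illustration only; the real kernel tiles this and never materializes the full score vector):

    #include <cmath>
    #include <vector>

    static std::vector<float> attn_one_query(
            const std::vector<float> & q,               // [d]
            const std::vector<std::vector<float>> & K,  // [n_kv][d]
            const std::vector<std::vector<float>> & V,  // [n_kv][d_v]
            const std::vector<float> & mask,            // [n_kv], 0 or -INFINITY
            float scale) {
        const size_t n_kv = K.size();
        std::vector<float> s(n_kv);
        float smax = -INFINITY;
        for (size_t j = 0; j < n_kv; ++j) {
            float dot = 0.0f;
            for (size_t i = 0; i < q.size(); ++i) {
                dot += q[i]*K[j][i];
            }
            s[j] = dot*scale + mask[j];          // scaled score plus mask bias
            smax = std::fmax(smax, s[j]);
        }
        float denom = 0.0f;
        for (size_t j = 0; j < n_kv; ++j) {      // numerically stable softmax
            s[j] = std::exp(s[j] - smax);
            denom += s[j];
        }
        std::vector<float> out(V[0].size(), 0.0f);
        for (size_t j = 0; j < n_kv; ++j) {      // weighted sum of value rows
            for (size_t i = 0; i < out.size(); ++i) {
                out[i] += (s[j]/denom)*V[j][i];
            }
        }
        return out;
    }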
@@ -6473,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
        struct ggml_context * ctx,
        const llama_model & model,
        const llama_hparams & hparams,
+       const llama_cparams & cparams,
        const llama_kv_cache & kv,
        struct ggml_cgraph * graph,
        struct ggml_tensor * wo,

@@ -6482,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
        struct ggml_tensor * q_cur,
        struct ggml_tensor * kq_mask,
        struct ggml_tensor * kq_pos,
-       int64_t n_ctx,
        int32_t n_tokens,
        int32_t kv_head,
        int32_t n_kv,

@@ -6496,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
     ggml_build_forward_expand(graph, k_cur);
     ggml_build_forward_expand(graph, v_cur);
 
-    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
+    llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
 
     struct ggml_tensor * cur;
 
-    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos,
+    cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;

@@ -6543,6 +6688,8 @@ struct llm_build_context {
     const int32_t kv_head; // index of where we store new KV data in the cache
     const int32_t n_orig_ctx;
 
+    const bool flash_attn;
+
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type rope_type;
 

@@ -6589,6 +6736,7 @@ struct llm_build_context {
         n_outputs    (worst_case ? n_tokens : lctx.n_outputs),
         kv_head      (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_orig_ctx   (cparams.n_yarn_orig_ctx),
+        flash_attn   (cparams.flash_attn),
         pooling_type (cparams.pooling_type),
         rope_type    (hparams.rope_type),
         cb           (cb),

@@ -6703,15 +6851,31 @@ struct llm_build_context {
                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
 
-                ggml_tensor * view_v_src
-
-                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                        ggml_row_size(kv_self.v_l[il]->type, i));
+                ggml_tensor * view_v_src;
+                ggml_tensor * view_v_dst;
 
-
-
-
-
+                if (flash_attn) {
+                    // NOTE: the V cache is not transposed when using flash attention
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                } else {
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, id));
+                }
 
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));

@@ -6741,20 +6905,26 @@ struct llm_build_context {
 
     struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
         if (causal) {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         } else {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         }
         cb(lctx.inp_KQ_mask, "KQ_mask", -1);
         ggml_set_input(lctx.inp_KQ_mask);
-        return lctx.inp_KQ_mask;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }
 
-    struct ggml_tensor * build_inp_KQ_pos() {
-
+    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
+        if (causal) {
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+        } else {
+            // TODO: this will be needed for ALiBi-based BERT models
+            //       https://github.com/ggerganov/llama.cpp/pull/6826
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
+        }
         cb(lctx.inp_KQ_pos, "KQ_pos", -1);
         ggml_set_input(lctx.inp_KQ_pos);
-        return lctx.inp_KQ_pos;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
     }
 
     struct ggml_tensor * build_inp_mean() {
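
Note: the mask now gets padded rows via GGML_PAD(n_tokens, GGML_KQ_MASK_PAD) so the flash-attention kernel sees batch sizes in fixed steps, and is cast to F16 on the flash path. GGML_PAD is plain round-up-to-a-multiple arithmetic (matching the macro in ggml.h, valid for power-of-two pads):

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // with a pad of 32: 1 -> 32, 32 -> 32, 33 -> 64
    static_assert(GGML_PAD( 1, 32) == 32, "");
    static_assert(GGML_PAD(32, 32) == 32, "");
    static_assert(GGML_PAD(33, 32) == 64, "");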
@@ -6860,9 +7030,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7000,9 +7170,9 @@ struct llm_build_context {
            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7107,9 +7277,9 @@ struct llm_build_context {
                ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Kcur, "Kcur", il);
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7227,9 +7397,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7352,9 +7522,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7504,9 +7674,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                    model.layers[il].wo, NULL,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7616,9 +7786,9 @@ struct llm_build_context {
 
            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7820,9 +7990,9 @@ struct llm_build_context {
            );
            cb(Vcur, "Vcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Q, KQ_mask, nullptr,
+                    Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -7916,9 +8086,9 @@ struct llm_build_context {
            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            cb(Qcur, "Qcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -8209,9 +8379,9 @@ struct llm_build_context {
 
            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -8340,14 +8510,15 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            } else {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
            }
        }
 

@@ -8489,9 +8660,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -8607,9 +8778,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -8720,9 +8891,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -8834,9 +9005,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -8989,9 +9160,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9106,9 +9277,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9219,9 +9390,9 @@ struct llm_build_context {
                ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
        struct ggml_tensor * sa_out = cur;
 

@@ -9322,9 +9493,9 @@ struct llm_build_context {
 
            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9429,9 +9600,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9545,9 +9716,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9662,9 +9833,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9792,9 +9963,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -9913,9 +10084,9 @@ struct llm_build_context {
                ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, NULL,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -10032,9 +10203,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -10322,9 +10493,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -10453,9 +10624,9 @@ struct llm_build_context {
            );
            cb(Kcur, "Kcur", il);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                    model.layers[il].wo, nullptr,
-                    Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
        }
 
        if (il == n_layer - 1) {

@@ -10882,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }
 
-
+    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
+    // this allows to process multiple sequences in parallel with ALiBi-based models
+    if (hparams.use_alibi) {
        const int64_t n_kv = kv_self.n;
 
        GGML_ASSERT(lctx.inp_KQ_pos);
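
Note: the hunk above renames the hparams flag to use_alibi and feeds real per-token positions into KQ_pos, so several sequences can be decoded in one batch. For reference, the classic ALiBi scheme biases the score of query i against key j by m_h*(j - i), with per-head slopes like this (textbook form; ggml's implementation interpolates powers of two for non-power-of-two head counts):

    #include <cmath>
    #include <vector>

    // slope for head h of H: 2^(-8*(h+1)/H)
    static std::vector<float> alibi_slopes(int n_head) {
        std::vector<float> m(n_head);
        for (int h = 0; h < n_head; ++h) {
            m[h] = std::pow(2.0f, -8.0f*(h + 1)/n_head);
        }
        return m;
    }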
@@ -11264,7 +11437,7 @@
            // a heuristic, to avoid attending the full cache if it is not yet utilized
            // after enough generations, the benefit from this heuristic disappears
            // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(
+            kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
            //kv_self.n = llama_kv_cache_cell_max(kv_self);
        }
    }
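
Note: the rebuilt line caps the attended KV window at the cache size while growing it in 256-cell steps, so small contexts do not pay for the whole cache. Worked example of the arithmetic:

    #include <algorithm>
    #include <cstdint>

    static uint32_t kv_n(uint32_t kv_size, uint32_t cell_max) {
        const uint32_t pad = (cell_max + 255)/256*256; // GGML_PAD(cell_max, 256)
        return std::min(kv_size, std::max(256u, pad));
    }

    // kv_n(4096,    1) == 256
    // kv_n(4096,  300) == 512
    // kv_n(4096, 5000) == 4096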
@@ -11432,6 +11605,10 @@
        }
    }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
    return 0;
 }
 

@@ -11457,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
    // each move requires 6*n_layer tensors (see build_defrag)
    //   - source view, destination view, copy operation
    //   - x2 for keys and values
-    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
 
    // determine which KV cells to move where
    //
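
Note: each defrag move contributes 6*n_layer graph nodes, and the temporary fix reserves an extra 2*n_layer nodes of headroom under the LLAMA_MAX_NODES = 8192 budget. For an 80-layer model that shaves one move off the cap:

    // before: 8192/(6*80)          = 17 moves
    // after:  (8192 - 2*80)/(6*80) = 16 moves
    static const unsigned kMaxNodes = 8192; // stands in for LLAMA_MAX_NODES

    static unsigned max_moves(unsigned n_layer) {
        return (kMaxNodes - 2*n_layer)/(6*n_layer);
    }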
@@ -11781,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
    }
    case LLAMA_VOCAB_TYPE_BPE: {
        GGML_ASSERT(false);
-        return unicode_utf8_to_byte(token_data.text);
+        return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
    }
    case LLAMA_VOCAB_TYPE_WPM: {
        GGML_ASSERT(false);

@@ -12003,7 +12182,79 @@ struct llm_tokenizer_bpe {
 
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        int final_prev_index = -1;
-
+
+        std::vector<std::string> word_collection;
+        switch (vocab.type) {
+            case LLAMA_VOCAB_TYPE_BPE:
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                            // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_MPT:
+                        // TODO: MPT pre-tokenization regexes are unknown
+                        //       the following are close, but not exact. run the following:
+                        //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                        GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                        word_collection = unicode_regex_split(text, {
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                        word_collection = unicode_regex_split(text, {
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    default:
+                        // default regex for BPE tokenization pre-processing
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                }
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
        symbols_final.clear();
 
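
Note: each pre-tokenizer above is just a list of regexes handed to unicode_regex_split, applied in order to carve text into chunks before BPE merging. A simplified, ASCII-only illustration with std::regex (the real patterns rely on Unicode categories such as \p{L}, which std::regex does not support):

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        // GPT-2-style pattern, reduced to ASCII character classes
        const std::regex re("'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^\\sA-Za-z0-9]+|\\s+");
        const std::string text = "We'll split this, won't we?";
        std::vector<std::string> words;
        for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
            words.push_back(it->str());
        }
        for (const auto & w : words) {
            std::printf("[%s]", w.c_str());
        }
        std::printf("\n"); // [We]['ll][ split][ this][,][ won]['t][ we][?]
        return 0;
    }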
@@ -12130,145 +12381,6 @@ private:
             work_queue.push(bigram);
         }
 
-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        std::string token = "";
-        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
-
-        std::vector<std::string> text_utf;
-        text_utf.reserve(text.size());
-        bpe_words.reserve(text.size());
-        bpe_encoded_words.reserve(text.size());
-
-        const auto cpts = unicode_cpts_from_utf8(text);
-        for (size_t i = 0; i < cpts.size(); ++i)
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
-
-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
-            // forward backward lookups
-            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_words.emplace_back(token);
-                    token = "";
-                    i++;
-                    continue;
-                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next + utf_char_next_next;
-                    bpe_words.emplace_back(token); // the contraction
-                    token = "";
-                    i += 2;
-                    continue;
-                }
-            }
-
-            if (!split_condition && !collecting) {
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    split_condition = true;
-                }
-            }
-
-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
-            }
-
-            if (split_condition) {
-                if (token.size()) {
-                    bpe_words.emplace_back(token);
-                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
-            }
-            else {
-                token += utf_char;
-            }
-        }
-
-        for (std::string & word : bpe_words) {
-            std::string encoded_token = "";
-            for (char & c : word) {
-                encoded_token += unicode_byte_to_utf8(c);
-            }
-            bpe_encoded_words.emplace_back(encoded_token);
-        }
-
-        return bpe_encoded_words;
-    }
-
     const llama_vocab & vocab;
 
     std::vector<llm_symbol> symbols;
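The hand-written GPT-2 scanner above (contraction lookahead plus the collecting_* state machine) is superseded by the regex-driven path, while the byte-to-UTF-8 remapping step (unicode_byte_to_utf8) remains part of BPE encoding elsewhere. A hypothetical check, not from the diff, of what that remapping does:

    // byte-level BPE remaps raw bytes to printable code points,
    // e.g. the space byte 0x20 becomes U+0120 ("Ġ")
    #include <cassert>
    #include <string>
    #include "unicode.h"

    int main() {
        assert(unicode_byte_to_utf8(' ') == std::string("\xC4\xA0")); // UTF-8 for U+0120
        return 0;
    }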
@@ -12588,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
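With this, a BPE vocab prepends BOS exactly when its metadata sets add_bos. A hedged usage sketch; llama_tokenize's signature is assumed from the llama.h bundled with this release:

    #include <vector>
    #include "llama.h"

    std::vector<llama_token> tokenize_with_bos(const llama_model * model) {
        std::vector<llama_token> toks(16);
        const int n = llama_tokenize(model, "hello", 5, toks.data(), (int) toks.size(),
                                     /*add_special*/ true, /*parse_special*/ false);
        toks.resize(n > 0 ? n : 0);
        return toks; // BOS is prepended only when the vocab opts in
    }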
@@ -14360,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 }
 
 static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
-    std::mutex mutex;
-    int64_t counter = 0;
-    size_t new_size = 0;
     if (nthread < 2) {
         // single-thread
-
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+        return new_size;
     }
-
+
+    std::mutex mutex;
+    int64_t counter = 0;
+    size_t new_size = 0;
+    bool valid = true;
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
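The compute lambda pulls row chunks off a shared counter under a mutex, so threads self-balance regardless of per-chunk cost. A minimal sketch of that chunk-stealing pattern (names are illustrative, not from the diff):

    #include <cstdint>
    #include <mutex>
    #include <thread>
    #include <vector>

    void for_each_chunk(int64_t nrows, int64_t nrows_per_chunk, int nthread) {
        std::mutex mutex;
        int64_t counter = 0;
        auto worker = [&]() {
            for (;;) {
                std::unique_lock<std::mutex> lock(mutex);
                const int64_t first_row = counter;   // claim the next chunk
                counter += nrows_per_chunk;
                lock.unlock();
                if (first_row >= nrows) break;       // no work left
                // ... process rows [first_row, first_row + nrows_per_chunk) ...
            }
        };
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
        worker(); // the calling thread participates too
        for (auto & w : workers) w.join();
    }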
@@ -14382,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
             }
             lock.unlock();
             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            local_size += this_size;
+
+            // validate the quantized data
+            const size_t row_size = ggml_row_size(new_type, n_per_row);
+            void * this_data = (char *) new_data + first_row * row_size;
+            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                valid = false;
+                break;
+            }
         }
     };
     for (int it = 0; it < nthread - 1; ++it) {
@@ -14391,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     compute();
     for (auto & w : workers) { w.join(); }
     workers.clear();
+    if (!valid) {
+        throw std::runtime_error("quantized data validation failed");
+    }
     return new_size;
 }
 
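The throw only happens after every worker has joined, so no thread is left touching the buffers. At the C API boundary the exception is caught and reported as a non-zero status; a hedged usage sketch:

    #include <cstdio>
    #include "llama.h"

    int quantize_checked(void) {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        if (llama_model_quantize("in.gguf", "out.q4_k.gguf", &qparams) != 0) {
            fprintf(stderr, "quantization failed (e.g. quantized data validation)\n");
            return 1;
        }
        return 0;
    }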
@@ -14453,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
     llama_model model;
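Quantization now always validates its input tensors; for ordinary loading the same machinery is opt-in through the new model parameter (defaulting to false, as shown further down). A hedged usage sketch:

    #include "llama.h"

    llama_model * load_validated(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.check_tensors = true; // validate tensor data while loading
        return llama_load_model_from_file(path, mparams);
    }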
@@ -14491,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     for (auto & o : overrides) {
         if (o.key[0] == 0) break;
         if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-            gguf_set_val_f32(ctx_out, o.key, o.
+            gguf_set_val_f32(ctx_out, o.key, o.val_f64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-            gguf_set_val_i32(ctx_out, o.key, o.
+            gguf_set_val_i32(ctx_out, o.key, o.val_i64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-            gguf_set_val_bool(ctx_out, o.key, o.
+            gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+        } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+            gguf_set_val_str(ctx_out, o.key, o.val_str);
         } else {
             LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
         }
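The override union members are renamed (val_f64, val_i64, val_bool) and a string variant, val_str, is new in this release. A hedged sketch of populating an override, assuming the llama.h struct layout bundled here:

    #include <cstdio>
    #include "llama.h"

    llama_model_kv_override make_pre_override(void) {
        llama_model_kv_override o{};
        snprintf(o.key, sizeof(o.key), "tokenizer.ggml.pre");
        o.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        snprintf(o.val_str, sizeof(o.val_str), "llama3"); // e.g. force a pre-tokenizer
        return o;
    }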
@@ -14814,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
     std::unique_ptr<llama_model_loader> ml;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
         ml->init_mappings(/*prefetch*/ false); // no prefetching
     }
 
@@ -15073,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only    =*/ false,
         /*.use_mmap      =*/ true,
         /*.use_mlock     =*/ false,
+        /*.check_tensors =*/ false,
     };
 
 #ifdef GGML_USE_METAL
@@ -15109,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all          =*/ false,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
+        /*.flash_attn          =*/ false,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -15275,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.defrag_thold = params.defrag_thold;
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
+    cparams.flash_attn   = params.flash_attn;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx        = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15282,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
-    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.n_yarn_orig_ctx = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
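GGML_PAD rounds its first argument up to a multiple of the second (ggml.h defines it along the lines of ((x + n - 1) & ~(n - 1)) for the power-of-two alignments used here). A worked example with assumed values:

    // a requested n_ctx of 4097 is padded up to 4352 (= 17 * 256)
    static_assert(((4097 + 256 - 1) & ~(256 - 1)) == 4352, "GGML_PAD example");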
@@ -15319,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (cparams.flash_attn && hparams.use_alibi) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+#ifdef GGML_USE_HIPBLAS
+    if (cparams.flash_attn) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+#endif
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
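A hedged usage sketch: flash attention is requested through the context params, but the checks above may silently force it back off (ALiBi models, Grok, HIPBLAS builds), so the effective setting is what the log line below reports.

    #include "llama.h"

    llama_context * make_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true; // request flash attention
        return llama_new_context_with_model(model, cparams);
    }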
@@ -15326,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch    = %u\n",   __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch   = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn = %d\n",   __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
@@ -15454,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_cpu);
 
-        if (!llama_kv_cache_init(ctx->kv_self, ctx
+        if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -16053,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     const size_t s_kv_head  = sizeof(uint32_t);
     const size_t s_kv_size  = sizeof(uint32_t);
     const size_t s_kv_used  = sizeof(uint32_t);
+    const size_t s_v_trans  = sizeof(uint32_t);
     const size_t s_kv       = ctx->kv_self.total_size();
     const size_t s_kv_cell  = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
     const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -16070,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
         + s_kv_head
         + s_kv_size
         + s_kv_used
+        + s_v_trans
         + s_kv
         + s_kv_cells
     );
 
+    // on session change it is very likely that the state size has changed - so we need to update this function
+    static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+
     return s_total;
 }
 
@@ -16219,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
         const uint32_t kv_size     = kv_self.size;
         const size_t   kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
         const uint32_t kv_used     = kv_self.used;
+        const uint32_t v_trans     = kv_self.v_trans ? 1 : 0;
 
         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
         data_ctx->write(&kv_head,     sizeof(kv_head));
         data_ctx->write(&kv_size,     sizeof(kv_size));
         data_ctx->write(&kv_used,     sizeof(kv_used));
+        data_ctx->write(&v_trans,     sizeof(v_trans));
 
         if (kv_buf_size) {
             const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -16236,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
                 ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                 data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
-                if (kv_self.recurrent) {
+                if (kv_self.recurrent || !kv_self.v_trans) {
                     // v is contiguous for recurrent models
                     // TODO: use other tensors for state models than k and v
                     const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
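The new v_trans flag records whether the V cache is stored transposed (the historical layout) or row-contiguous (used by the flash-attention path), and session data now embeds it so a restore into an incompatible layout asserts instead of silently corrupting the cache. A hedged round-trip sketch:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    void save_and_restore(llama_context * ctx, llama_context * ctx2) {
        std::vector<uint8_t> state(llama_state_get_size(ctx));
        llama_state_get_data(ctx, state.data());
        // ctx2 must use the same V-cache layout (same v_trans), or the
        // GGML_ASSERT below in llama_state_set_data fires
        llama_state_set_data(ctx2, state.data());
    }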
@@ -16369,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
         uint32_t kv_head;
         uint32_t kv_size;
         uint32_t kv_used;
+        uint32_t v_trans;
 
         memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
         memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
         memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
         memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);
+        memcpy(&v_trans,     inp, sizeof(v_trans));     inp += sizeof(v_trans);
+
+        GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
 
         if (kv_self.size != kv_size) {
             // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16383,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
                 __func__, kv_head, kv_size, kv_self.size);
         }
 
+        llama_kv_cache_clear(ctx);
+
         if (kv_buf_size) {
             const size_t pre_kv_buf_size = inp - src;
 
@@ -16394,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
                 ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                 inp += k_size;
 
-                if (kv_self.recurrent) {
+                if (kv_self.recurrent || !kv_self.v_trans) {
                     // v is contiguous for recurrent models
                     // TODO: use other tensors for state models than k and v
                     const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16416,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
             GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
         }
 
-        llama_kv_cache_clear(ctx);
-
         ctx->kv_self.head = kv_head;
         ctx->kv_self.used = kv_used;
 
@@ -16677,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
     }
 
-    //
-
-
-
-
-
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Write value type
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            data_ctx.write(&v_type_i, sizeof(v_type_i));
 
-
-
+            // Write row size of value
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            data_ctx.write(&v_size_row, sizeof(v_size_row));
 
-
-            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                // Read each range of cells of v_size_el length each into tmp_buf and write out
+            // Read each range of cells of v_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-
-                tmp_buf.
-                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                tmp_buf.resize(range_size * v_size_row);
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
                 data_ctx.write(tmp_buf.data(), tmp_buf.size());
             }
         }
+    } else {
+        // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+        const uint32_t kv_size = kv_self.size;
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Write value type
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+            // Write element size
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+            // For each row, we get the element values of each cell
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                    tmp_buf.resize(range_size * v_size_el);
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                    data_ctx.write(tmp_buf.data(), tmp_buf.size());
+                }
+            }
+        }
     }
 
     return data_ctx.get_size_written();
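In the transposed layout, element (cell i, row j) of a layer's V tensor lives at byte offset (i + j * kv_size) * v_size_el, so a contiguous range of cells within one row can be copied with a single ggml_backend_tensor_get. A worked example with assumed values:

    // cells per row, element size, and position are illustrative
    const unsigned kv_size   = 1024; // cells per row
    const unsigned v_size_el = 2;    // e.g. an F16 element
    const unsigned j = 3, first = 10;
    const unsigned src_offset = (first + j * kv_size) * v_size_el; // == 6164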
@@ -16823,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
         }
     }
 
-    //
-
-
-
-
-
-
-
-
-
-
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
 
-
-
-
-
-
-
-
-
-
+            // Read row size of value
+            size_t v_size_row_ref;
+            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+            inp += sizeof(v_size_row_ref);
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+                return 0;
+            }
 
-
-
-
-
-
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+                inp += cell_count * v_size_row;
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
+
+            // Read element size of value
+            size_t v_size_el_ref;
+            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+            inp += sizeof(v_size_el_ref);
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+                return 0;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                    inp += cell_count * v_size_el;
+                }
+            }
         }
     }
 
     const size_t nread = inp - src;
+
     return nread;
 }
 
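On any type or size mismatch the partially restored sequence is removed via llama_kv_cache_seq_rm and 0 is returned, so callers can treat a zero read count as "state not restored". A hedged usage sketch:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    bool restore_seq(llama_context * ctx, const std::vector<uint8_t> & buf, llama_seq_id dest) {
        const size_t nread = llama_state_seq_set_data(ctx, buf.data(), dest);
        if (nread == 0) {
            fprintf(stderr, "failed to restore sequence state\n");
            return false; // any partial write was rolled back
        }
        return true;
    }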
@@ -17654,9 +17882,9 @@ const char * llama_print_system_info(void) {
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
 #ifdef GGML_USE_LLAMAFILE
-    s += "
+    s += "LLAMAFILE = 1 | ";
 #else
-    s += "
+    s += "LLAMAFILE = 0 | ";
 #endif
 
     return s.c_str();