llama_cpp 0.14.7 → 0.15.0
This diff shows the changes between publicly released versions of the package, as published to the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in those registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +53 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +18 -3
- data/vendor/tmp/llama.cpp/Makefile +41 -16
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +391 -27
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +623 -395
- data/vendor/tmp/llama.cpp/llama.h +27 -9
- data/vendor/tmp/llama.cpp/sgemm.cpp +83 -87
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
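
The headline changes in the vendored llama.cpp are visible below: an optional Flash Attention code path (`flash_attn`), optional tensor-data validation at load time (`check_tensors`), and a new `tokenizer.ggml.pre` metadata key recording the BPE pre-tokenizer. A minimal sketch of how a client could opt into the two new knobs through the llama.h C API follows; the field and function names are taken from this release's headers, so verify against the shipped llama.h.

#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) { return 1; }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.check_tensors = true; // validate tensor data while loading (new in this release)

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (!model) { return 1; }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true;    // enable the Flash Attention path (new in this release)

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... use ctx ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_fini();
    return 0;
}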
--- 0.14.7: data/vendor/tmp/llama.cpp/llama.cpp
+++ 0.15.0: data/vendor/tmp/llama.cpp/llama.cpp
@@ -75,6 +75,7 @@
 #include <forward_list>
 #include <fstream>
 #include <functional>
+#include <future>
 #include <initializer_list>
 #include <locale>
 #include <map>
@@ -107,7 +108,6 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 60
 
-
 //
 // logging
 //
@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
 
    { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+   { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
    { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
    { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -1843,7 +1845,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool
+    bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -1933,6 +1935,7 @@ struct llama_cparams {
     bool embeddings;
     bool causal_attn;
     bool offload_kqv;
+    bool flash_attn;
 
     enum llama_pooling_type pooling_type;
 
@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
     bool do_copy   = false;
-    // with recurrent state models, a cell can hold the state for more than one past token
-    bool
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans   = true;  // the value tensor is transposed
 
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2114,7 +2117,8 @@ struct llama_vocab {
         ttype type;
     };
 
-    enum llama_vocab_type
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -2335,11 +2339,14 @@ struct llama_context {
 
 static bool llama_kv_cache_init(
              struct llama_kv_cache & cache,
-
+               const llama_context * ctx,
                          ggml_type   type_k,
                          ggml_type   type_v,
                           uint32_t   kv_size,
                               bool   offload) {
+    const llama_model & model = ctx->model;
+    const llama_cparams & cparams = ctx->cparams;
+
     const struct llama_hparams & hparams = model.hparams;
 
     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
 
     // TODO: find a nicer way to add other recurrent model architectures
     cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+    cache.v_trans   = !cparams.flash_attn;
 
-    // TODO: support mixed
+    // TODO: support mixed recurrent Transformer architectures
     // NOTE: (!a || b) is a logical implication (a -> b)
     GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
     GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     }
     cache.head = 0;
     cache.used = 0;
+
+    for (auto & buf : cache.bufs) {
+        ggml_backend_buffer_clear(buf, 0);
+    }
 }
 
 static bool llama_kv_cache_seq_rm(
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
             case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
         }
         return "unknown";
     }
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
                 __func__, override_type_to_str(ovrd->tag), ovrd->key);
             switch (ovrd->tag) {
                 case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
-                    LLAMA_LOG_INFO("%s\n", ovrd->
+                    LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
                 } break;
                 case LLAMA_KV_OVERRIDE_TYPE_INT: {
-                    LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->
+                    LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
                 } break;
                 case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                    LLAMA_LOG_INFO("%.6f\n", ovrd->
+                    LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+                } break;
+                case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                    LLAMA_LOG_INFO("%s\n", ovrd->val_str);
                 } break;
                 default:
                     // Shouldn't be possible to end up here, but just in case...
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override * ovrd) {
             if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-                target = ovrd->
+                target = ovrd->val_bool;
                 return true;
             }
             return false;
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override * ovrd) {
             if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-                target = ovrd->
+                target = ovrd->val_i64;
                 return true;
             }
             return false;
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-                target = ovrd->
+                target = ovrd->val_f64;
                 return true;
             }
             return false;
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
         try_override(T & target, const struct llama_model_kv_override * ovrd) {
-            (
-
-
-
-
-                ovrd ? ovrd->key : "NULL"));
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+                target = ovrd->val_str;
+                return true;
+            }
+            return false;
         }
 
         static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool check_tensors;
 
     llama_files files;
     llama_ftype ftype;
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
             trace = atoi(getenv("LLAMA_TRACE"));
@@ -3115,9 +3131,17 @@ struct llama_model_loader {
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
+        std::set<std::string> tensor_names;
         for (auto & w : weights) {
             n_elements += ggml_nelements(w.tensor);
             n_bytes    += ggml_nbytes(w.tensor);
+            // make sure there is no duplicated tensor names
+            const std::string name(w.tensor->name);
+            auto found = tensor_names.find(name);
+            if (found != tensor_names.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+            }
+            tensor_names.insert(name);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3223,6 +3247,7 @@ struct llama_model_loader {
         }
 
         this->use_mmap = use_mmap;
+        this->check_tensors = check_tensors;
     }
 
     ~llama_model_loader() {
@@ -3481,6 +3506,10 @@ struct llama_model_loader {
             file->seek(w.offs, SEEK_SET);
             file->read_raw(cur->data, ggml_nbytes(cur));
         }
+
+        if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+        }
     }
 
     size_t size_done = 0;
@@ -3497,6 +3526,8 @@ struct llama_model_loader {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
+        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3518,37 +3549,66 @@ struct llama_model_loader {
                 if (bufs_mmap.count(weight->idx)) {
                     buf_mmap = bufs_mmap.at(weight->idx);
                 }
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur,
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
                     if (lmlocks) {
                         const auto & lmlock = lmlocks->at(weight->idx);
-                        lmlock->grow_to(weight->offs +
+                        lmlock->grow_to(weight->offs + n_size);
                     }
 
                     auto & mmap_used = mmaps_used[weight->idx];
                     mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                     mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur,
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data,
+                    file->read_raw(cur->data, n_size);
+                    if (check_tensors) {
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                        }));
+                    }
                 } else {
-                    read_buf.resize(
+                    read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(),
+                    file->read_raw(read_buf.data(), n_size);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
 
             size_done += n_size;
         }
 
+        // check validation results
+        bool validation_failed = false;
+        for (auto & future : validation_result) {
+            auto result = future.get();
+            if (!result.second) {
+                LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+                validation_failed = true;
+            }
+        }
+        if (validation_failed) {
+            throw std::runtime_error("found tensors with invalid data");
+        }
+
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
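
The loader change above overlaps tensor validation with I/O by pushing each check onto a std::async task and only harvesting the results after all reads have been issued. A self-contained sketch of the same pattern, with a toy validate_chunk standing in for ggml_validate_row_data:

#include <cstdio>
#include <future>
#include <stdexcept>
#include <utility>
#include <vector>

// Toy stand-in for ggml_validate_row_data: here, "valid" just means non-empty.
static bool validate_chunk(const std::vector<float> & chunk) {
    return !chunk.empty();
}

int main() {
    std::vector<std::vector<float>> chunks = { {1.0f, 2.0f}, {}, {3.0f} };

    // Kick off one async validation task per chunk, keeping a future per result.
    std::vector<std::future<std::pair<size_t, bool>>> results;
    for (size_t i = 0; i < chunks.size(); ++i) {
        results.emplace_back(std::async(std::launch::async, [&chunks, i] {
            return std::make_pair(i, validate_chunk(chunks[i]));
        }));
    }

    // Harvest every result before deciding, so all failures get reported.
    bool failed = false;
    for (auto & f : results) {
        auto r = f.get();
        if (!r.second) {
            fprintf(stderr, "chunk %zu has invalid data\n", r.first);
            failed = true;
        }
    }
    if (failed) {
        throw std::runtime_error("found chunks with invalid data");
    }
    return 0;
}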
@@ -4142,7 +4202,7 @@ static void llm_load_hparams(
     model.ftype = ml.ftype;
 
     if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.
+        hparams.use_alibi = true;
     }
 
     hparams.rope_type = llama_rope_type(&model);
@@ -4165,11 +4225,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL,
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
 
-        if (
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -4183,7 +4245,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;
 
             return;
-        } else if (
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -4228,9 +4290,27 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (
-            vocab.type =
+        } else if (tokenizer_model == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
+            // default special tokens
+            vocab.special_bos_id  = -1;
+            vocab.special_eos_id  = -1;
+            vocab.special_unk_id  = 100;
+            vocab.special_sep_id  = 102;
+            vocab.special_pad_id  = 0;
+            vocab.special_cls_id  = 101;
+            vocab.special_mask_id = 103;
+            vocab.add_space_prefix = false;
+        } else {
+            if (tokenizer_model == "gpt2") {
+                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+            } else {
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+                vocab.type = LLAMA_VOCAB_TYPE_SPM;
+                return;
+            }
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4264,23 +4344,50 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
-        }
-        vocab.type = LLAMA_VOCAB_TYPE_WPM;
+        }
 
-
-
-
-
-
-
-
-
-
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
         } else {
-
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
     }
 
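
The new pre-tokenizer plumbing hinges on the `tokenizer.ggml.pre` GGUF key: models converted without it fall back to the 'default' pre-tokenizer with the loud warning above. A sketch of inspecting that key with the gguf C API (the model path is hypothetical; the gguf functions live in ggml.h at this vintage):

#include "ggml.h"

#include <cstdio>

int main() {
    const char * model_path = "model.gguf"; // hypothetical

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(model_path, params);
    if (!ctx) { return 1; }

    const int keyidx = gguf_find_key(ctx, "tokenizer.ggml.pre");
    if (keyidx == -1) {
        // older conversion: llama.cpp will warn and use the 'default' pre-tokenizer
        printf("no pre-tokenizer type recorded\n");
    } else {
        printf("pre-tokenizer: %s\n", gguf_get_val_str(ctx, keyidx));
    }

    gguf_free(ctx);
    return 0;
}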
@@ -5975,7 +6082,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
@@ -6104,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * k_cur,
          struct ggml_tensor * v_cur,
-                     int64_t   n_ctx,
                      int32_t   n_tokens,
                      int32_t   kv_head,
          const llm_build_cb & cb,
                      int64_t   il) {
+    const int64_t n_ctx = cparams.n_ctx;
+
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     GGML_ASSERT(kv.size == n_ctx);
 
-    // compute the transposed [n_tokens, n_embd] V matrix
-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
-    cb(v_cur_t, "v_cur_t", il);
-
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
             (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-
-
-
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    if (cparams.flash_attn) {
+        v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+                (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+                (  n_ctx)*ggml_element_size(kv.v_l[il]),
+                (kv_head)*ggml_element_size(kv.v_l[il]));
+
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
     cb(v_cache_view, "v_cache_view", il);
 
-
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
 }
 
 static struct ggml_tensor * llm_build_norm(
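
The store path above is the crux of the new v_trans flag: with Flash Attention the V cache keeps one contiguous row of values per token (the same layout as K), while the classic path stores V transposed so each embedding row spans n_ctx columns and a token occupies one column. A toy offset calculation under assumed sizes:

#include <cstddef>
#include <cstdio>

// Element offsets of the V-cache slice written at kv_head, in both layouts.
// n_embd_v_gqa = V values per token, n_ctx = cache size in tokens (assumed values).
int main() {
    const size_t n_embd_v_gqa = 1024;
    const size_t n_ctx        = 4096;
    const size_t kv_head      = 37;

    // flash-attn layout: token-major, one contiguous row of n_embd_v_gqa values per token
    const size_t offs_flash = kv_head * n_embd_v_gqa;

    // classic layout: V is transposed; the view starts at column kv_head
    // and each of the n_embd_v_gqa rows is strided by n_ctx elements
    const size_t offs_trans = kv_head;
    const size_t row_stride = n_ctx;

    printf("flash: offset %zu | transposed: offset %zu, row stride %zu\n",
            offs_flash, offs_trans, row_stride);
    return 0;
}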
@@ -6354,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
     return moe_out;
 }
 
-// if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
           const llama_model & model,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
@@ -6366,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
          struct ggml_tensor * kq_pos,
-                     int64_t   n_ctx,
                      int32_t   n_tokens,
                      int32_t   n_kv,
                        float   kq_scale,
          const llm_build_cb & cb,
                          int   il) {
+    const int64_t n_ctx         = cparams.n_ctx;
     const int64_t n_head        = hparams.n_head;
     const int64_t n_head_kv     = hparams.n_head_kv;
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6389,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
                 0);
     cb(k, "k", il);
 
-    struct ggml_tensor *
-    cb(kq, "kq", il);
+    struct ggml_tensor * cur;
 
-    if (
-
-
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-    }
+    if (cparams.flash_attn) {
+        GGML_UNUSED(model);
+        GGML_UNUSED(n_ctx);
 
-
-    //
-
-    // and then :
-    // kq = 30 * tanh(kq / 30)
-    // before the softmax below
+        // note: if this assert triggers, then some check has failed earlier
+        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
+        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
 
-    //
-
+        // split cached v into n_head heads (not transposed)
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_embd_head_v, n_kv, n_head_kv,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    0);
+        cb(v, "v", il);
 
-
-
-
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+        }
+
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+    } else {
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+        cb(kq, "kq", il);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+        }
+
+        if (model.arch == LLM_ARCH_GROK) {
+            // need to do the following:
+            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // and then :
+            // kq = 30 * tanh(kq / 30)
+            // before the softmax below
+
+            //try from phi2
+            //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+            kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+            kq = ggml_scale(ctx, kq, 30);
+        }
 
 #if defined(GGML_USE_KOMPUTE)
 #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
-
-
+        if (hparams.use_alibi) {
+            kq = ggml_scale(ctx, kq, kq_scale);
+            cb(kq, "kq_scaled", il);
 
-
-
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
 
-
-
+            kq = ggml_add(ctx, kq, kq_mask);
+            cb(kq, "kq_masked", il);
 
-
-
-
+            kq = ggml_soft_max(ctx, kq);
+            cb(kq, "kq_soft_max", il);
+        } else
 #endif
-
-
-
-
+        {
+            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+        }
 
-
+        GGML_ASSERT(kv.size == n_ctx);
 
-
-
-
-
-
-
-
-
+        // split cached v into n_head heads
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_kv, n_embd_head_v, n_head_kv,
+                    ggml_element_size(kv.v_l[il])*n_ctx,
+                    ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+                    0);
+        cb(v, "v", il);
 
-
-
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+        cb(kqv, "kqv", il);
 
-
-
+        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+        cb(kqv_merged, "kqv_merged", il);
 
-
-
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cb(cur, "kqv_merged_cont", il);
+    }
 
     ggml_build_forward_expand(graph, cur);
 
@@ -6473,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_context * ctx,
           const llama_model & model,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
@@ -6482,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
          struct ggml_tensor * kq_pos,
-                     int64_t   n_ctx,
                      int32_t   n_tokens,
                      int32_t   kv_head,
                      int32_t   n_kv,
@@ -6496,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
     ggml_build_forward_expand(graph, k_cur);
     ggml_build_forward_expand(graph, v_cur);
 
-    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
+    llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
 
     struct ggml_tensor * cur;
 
-    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos,
+    cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
@@ -6543,6 +6688,8 @@ struct llm_build_context {
     const int32_t kv_head;  // index of where we store new KV data in the cache
     const int32_t n_orig_ctx;
 
+    const bool flash_attn;
+
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type    rope_type;
 
@@ -6589,6 +6736,7 @@ struct llm_build_context {
         n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
         kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
+        flash_attn       (cparams.flash_attn),
         pooling_type     (cparams.pooling_type),
         rope_type        (hparams.rope_type),
         cb               (cb),
@@ -6703,15 +6851,31 @@ struct llm_build_context {
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
 
-                ggml_tensor * view_v_src
-
-                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                        ggml_row_size(kv_self.v_l[il]->type, i));
+                ggml_tensor * view_v_src;
+                ggml_tensor * view_v_dst;
 
-
-
-
-
+                if (flash_attn) {
+                    // NOTE: the V cache is not transposed when using flash attention
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                } else {
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, id));
+                }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6741,20 +6905,26 @@ struct llm_build_context {
 
     struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
         if (causal) {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         } else {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         }
         cb(lctx.inp_KQ_mask, "KQ_mask", -1);
         ggml_set_input(lctx.inp_KQ_mask);
-        return lctx.inp_KQ_mask;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }
 
-    struct ggml_tensor * build_inp_KQ_pos() {
-
+    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
+        if (causal) {
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+        } else {
+            // TODO: this will be needed for ALiBi-based BERT models
+            //       https://github.com/ggerganov/llama.cpp/pull/6826
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
+        }
        cb(lctx.inp_KQ_pos, "KQ_pos", -1);
        ggml_set_input(lctx.inp_KQ_pos);
-        return lctx.inp_KQ_pos;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
     }
 
     struct ggml_tensor * build_inp_mean() {
|
|
6860
7030
|
);
|
6861
7031
|
cb(Kcur, "Kcur", il);
|
6862
7032
|
|
6863
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7033
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
6864
7034
|
model.layers[il].wo, model.layers[il].bo,
|
6865
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7035
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6866
7036
|
}
|
6867
7037
|
|
6868
7038
|
if (il == n_layer - 1) {
|
@@ -7000,9 +7170,9 @@ struct llm_build_context {
|
|
7000
7170
|
cb(Qcur, "Qcur", il);
|
7001
7171
|
cb(Kcur, "Kcur", il);
|
7002
7172
|
|
7003
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7173
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7004
7174
|
model.layers[il].wo, NULL,
|
7005
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
7175
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7006
7176
|
}
|
7007
7177
|
|
7008
7178
|
if (il == n_layer - 1) {
|
@@ -7107,9 +7277,9 @@ struct llm_build_context {
|
|
7107
7277
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7108
7278
|
);
|
7109
7279
|
cb(Kcur, "Kcur", il);
|
7110
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7280
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7111
7281
|
model.layers[il].wo, NULL,
|
7112
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
7282
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7113
7283
|
}
|
7114
7284
|
|
7115
7285
|
if (il == n_layer - 1) {
|
@@ -7227,9 +7397,9 @@ struct llm_build_context {
|
|
7227
7397
|
);
|
7228
7398
|
cb(Kcur, "Kcur", il);
|
7229
7399
|
|
7230
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7400
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7231
7401
|
model.layers[il].wo, NULL,
|
7232
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7402
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7233
7403
|
}
|
7234
7404
|
|
7235
7405
|
if (il == n_layer - 1) {
|
@@ -7352,9 +7522,9 @@ struct llm_build_context {
|
|
7352
7522
|
);
|
7353
7523
|
cb(Kcur, "Kcur", il);
|
7354
7524
|
|
7355
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7525
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7356
7526
|
model.layers[il].wo, model.layers[il].bo,
|
7357
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7527
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7358
7528
|
}
|
7359
7529
|
|
7360
7530
|
if (il == n_layer - 1) {
|
@@ -7504,9 +7674,9 @@ struct llm_build_context {
|
|
7504
7674
|
);
|
7505
7675
|
cb(Kcur, "Kcur", il);
|
7506
7676
|
|
7507
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7508
|
-
|
7509
|
-
|
7677
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7678
|
+
model.layers[il].wo, NULL,
|
7679
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7510
7680
|
}
|
7511
7681
|
|
7512
7682
|
if (il == n_layer - 1) {
|
@@ -7616,9 +7786,9 @@ struct llm_build_context {
|
|
7616
7786
|
|
7617
7787
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7618
7788
|
|
7619
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7789
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7620
7790
|
model.layers[il].wo, model.layers[il].bo,
|
7621
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7791
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7622
7792
|
}
|
7623
7793
|
|
7624
7794
|
if (il == n_layer - 1) {
|
@@ -7820,9 +7990,9 @@ struct llm_build_context {
|
|
7820
7990
|
);
|
7821
7991
|
cb(Vcur, "Vcur", il);
|
7822
7992
|
|
7823
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7993
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7824
7994
|
model.layers[il].wo, model.layers[il].bo,
|
7825
|
-
Kcur, Vcur, Q, KQ_mask, nullptr,
|
7995
|
+
Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7826
7996
|
}
|
7827
7997
|
|
7828
7998
|
if (il == n_layer - 1) {
|
@@ -7916,9 +8086,9 @@ struct llm_build_context {
|
|
7916
8086
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7917
8087
|
cb(Qcur, "Qcur", il);
|
7918
8088
|
|
7919
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8089
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7920
8090
|
model.layers[il].wo, NULL,
|
7921
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8091
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7922
8092
|
}
|
7923
8093
|
|
7924
8094
|
if (il == n_layer - 1) {
|
@@ -8209,9 +8379,9 @@ struct llm_build_context {
|
|
8209
8379
|
|
8210
8380
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8211
8381
|
|
8212
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8382
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8213
8383
|
model.layers[il].wo, model.layers[il].bo,
|
8214
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8384
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8215
8385
|
}
|
8216
8386
|
|
8217
8387
|
if (il == n_layer - 1) {
|
@@ -8340,14 +8510,15 @@ struct llm_build_context {
|
|
8340
8510
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8341
8511
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8342
8512
|
|
8343
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8344
|
-
|
8345
|
-
|
8513
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8514
|
+
model.layers[il].wo, model.layers[il].bo,
|
8515
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8346
8516
|
} else {
|
8347
8517
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8348
|
-
|
8518
|
+
|
8519
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8349
8520
|
model.layers[il].wo, model.layers[il].bo,
|
8350
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8521
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8351
8522
|
}
|
8352
8523
|
}
|
8353
8524
|
|
@@ -8489,9 +8660,9 @@ struct llm_build_context {
|
|
8489
8660
|
);
|
8490
8661
|
cb(Kcur, "Kcur", il);
|
8491
8662
|
|
8492
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8663
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8493
8664
|
model.layers[il].wo, NULL,
|
8494
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8665
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8495
8666
|
}
|
8496
8667
|
|
8497
8668
|
if (il == n_layer - 1) {
|
@@ -8607,9 +8778,9 @@ struct llm_build_context {
|
|
8607
8778
|
);
|
8608
8779
|
cb(Kcur, "Kcur", il);
|
8609
8780
|
|
8610
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8781
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8611
8782
|
model.layers[il].wo, NULL,
|
8612
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8783
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8613
8784
|
}
|
8614
8785
|
|
8615
8786
|
if (il == n_layer - 1) {
|
@@ -8720,9 +8891,9 @@ struct llm_build_context {
|
|
8720
8891
|
);
|
8721
8892
|
cb(Kcur, "Kcur", il);
|
8722
8893
|
|
8723
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8894
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8724
8895
|
model.layers[il].wo, model.layers[il].bo,
|
8725
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8896
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8726
8897
|
}
|
8727
8898
|
|
8728
8899
|
if (il == n_layer - 1) {
|
@@ -8834,9 +9005,9 @@ struct llm_build_context {
|
|
8834
9005
|
);
|
8835
9006
|
cb(Kcur, "Kcur", il);
|
8836
9007
|
|
8837
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9008
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8838
9009
|
model.layers[il].wo, model.layers[il].bo,
|
8839
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9010
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8840
9011
|
}
|
8841
9012
|
|
8842
9013
|
if (il == n_layer - 1) {
|
@@ -8989,9 +9160,9 @@ struct llm_build_context {
|
|
8989
9160
|
);
|
8990
9161
|
cb(Kcur, "Kcur", il);
|
8991
9162
|
|
8992
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9163
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8993
9164
|
model.layers[il].wo, model.layers[il].bo,
|
8994
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9165
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8995
9166
|
}
|
8996
9167
|
|
8997
9168
|
if (il == n_layer - 1) {
|
@@ -9106,9 +9277,9 @@ struct llm_build_context {
|
|
9106
9277
|
);
|
9107
9278
|
cb(Kcur, "Kcur", il);
|
9108
9279
|
|
9109
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9110
|
-
|
9111
|
-
|
9280
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9281
|
+
model.layers[il].wo, model.layers[il].bo,
|
9282
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9112
9283
|
}
|
9113
9284
|
|
9114
9285
|
if (il == n_layer - 1) {
|
@@ -9219,9 +9390,9 @@ struct llm_build_context {
|
|
9219
9390
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9220
9391
|
cb(Kcur, "Kcur", il);
|
9221
9392
|
|
9222
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9393
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9223
9394
|
model.layers[il].wo, NULL,
|
9224
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9395
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9225
9396
|
}
|
9226
9397
|
struct ggml_tensor * sa_out = cur;
|
9227
9398
|
|
@@ -9322,9 +9493,9 @@ struct llm_build_context {
|
|
9322
9493
|
|
9323
9494
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9324
9495
|
|
9325
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9496
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9326
9497
|
model.layers[il].wo, model.layers[il].bo,
|
9327
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9498
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9328
9499
|
}
|
9329
9500
|
|
9330
9501
|
if (il == n_layer - 1) {
|
@@ -9429,9 +9600,9 @@ struct llm_build_context {
|
|
9429
9600
|
);
|
9430
9601
|
cb(Kcur, "Kcur", il);
|
9431
9602
|
|
9432
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9603
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9433
9604
|
model.layers[il].wo, model.layers[il].bo,
|
9434
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9605
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9435
9606
|
}
|
9436
9607
|
|
9437
9608
|
if (il == n_layer - 1) {
|
@@ -9545,9 +9716,9 @@ struct llm_build_context {
|
|
9545
9716
|
);
|
9546
9717
|
cb(Kcur, "Kcur", il);
|
9547
9718
|
|
9548
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9719
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9549
9720
|
model.layers[il].wo, NULL,
|
9550
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9721
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9551
9722
|
}
|
9552
9723
|
|
9553
9724
|
if (il == n_layer - 1) {
|
@@ -9662,9 +9833,9 @@ struct llm_build_context {
|
|
9662
9833
|
);
|
9663
9834
|
cb(Kcur, "Kcur", il);
|
9664
9835
|
|
9665
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9836
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9666
9837
|
model.layers[il].wo, model.layers[il].bo,
|
9667
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9838
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9668
9839
|
}
|
9669
9840
|
|
9670
9841
|
if (il == n_layer - 1) {
|
@@ -9792,9 +9963,9 @@ struct llm_build_context {
|
|
9792
9963
|
);
|
9793
9964
|
cb(Kcur, "Kcur", il);
|
9794
9965
|
|
9795
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9966
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9796
9967
|
model.layers[il].wo, model.layers[il].bo,
|
9797
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9968
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9798
9969
|
}
|
9799
9970
|
|
9800
9971
|
if (il == n_layer - 1) {
|
@@ -9913,9 +10084,9 @@ struct llm_build_context {
|
|
9913
10084
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9914
10085
|
cb(Kcur, "Kcur", il);
|
9915
10086
|
|
9916
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10087
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9917
10088
|
model.layers[il].wo, NULL,
|
9918
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10089
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9919
10090
|
}
|
9920
10091
|
|
9921
10092
|
if (il == n_layer - 1) {
|
@@ -10032,9 +10203,9 @@ struct llm_build_context {
|
|
10032
10203
|
);
|
10033
10204
|
cb(Kcur, "Kcur", il);
|
10034
10205
|
|
10035
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10206
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10036
10207
|
model.layers[il].wo, model.layers[il].bo,
|
10037
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10208
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10038
10209
|
}
|
10039
10210
|
|
10040
10211
|
if (il == n_layer - 1) {
|
@@ -10322,9 +10493,9 @@ struct llm_build_context {
|
|
10322
10493
|
);
|
10323
10494
|
cb(Kcur, "Kcur", il);
|
10324
10495
|
|
10325
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10496
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10326
10497
|
model.layers[il].wo, model.layers[il].bo,
|
10327
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10498
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10328
10499
|
}
|
10329
10500
|
|
10330
10501
|
if (il == n_layer - 1) {
|
@@ -10453,9 +10624,9 @@ struct llm_build_context {
|
|
10453
10624
|
);
|
10454
10625
|
cb(Kcur, "Kcur", il);
|
10455
10626
|
|
10456
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10627
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10457
10628
|
model.layers[il].wo, nullptr,
|
10458
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10629
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10459
10630
|
}
|
10460
10631
|
|
10461
10632
|
if (il == n_layer - 1) {
|
@@ -10882,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
10882
11053
|
}
|
10883
11054
|
}
|
10884
11055
|
|
10885
|
-
|
11056
|
+
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
11057
|
+
// this allows to process multiple sequences in parallel with ALiBi-based models
|
11058
|
+
if (hparams.use_alibi) {
|
10886
11059
|
const int64_t n_kv = kv_self.n;
|
10887
11060
|
|
10888
11061
|
GGML_ASSERT(lctx.inp_KQ_pos);
|
@@ -11264,7 +11437,7 @@ static int llama_decode_internal(
|
|
11264
11437
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
11265
11438
|
// after enough generations, the benefit from this heuristic disappears
|
11266
11439
|
// if we start defragmenting the cache, the benefit from this will be more important
|
11267
|
-
kv_self.n = std::min(kv_self.size, std::max(
|
11440
|
+
kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
|
11268
11441
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
11269
11442
|
}
|
11270
11443
|
}
|
@@ -11432,6 +11605,10 @@ static int llama_decode_internal(
|
|
11432
11605
|
}
|
11433
11606
|
}
|
11434
11607
|
|
11608
|
+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
11609
|
+
// overlap with device computation.
|
11610
|
+
ggml_backend_sched_reset(lctx.sched);
|
11611
|
+
|
11435
11612
|
return 0;
|
11436
11613
|
}
|
11437
11614
|
|
@@ -11457,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|
11457
11634
|
// each move requires 6*n_layer tensors (see build_defrag)
|
11458
11635
|
// - source view, destination view, copy operation
|
11459
11636
|
// - x2 for keys and values
|
11460
|
-
const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
11637
|
+
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
11638
|
+
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
11639
|
+
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
|
11461
11640
|
|
11462
11641
|
// determine which KV cells to move where
|
11463
11642
|
//
|
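
The defrag budget change reserves 2*n_layer graph nodes of headroom before dividing by the 6*n_layer nodes each move costs. With LLAMA_MAX_NODES = 8192 (defined earlier in this diff) and a hypothetical 80-layer model, that trims the per-graph move budget from 17 to 16:

#include <cstdio>

// Worked example of the defrag "max_moves" budget arithmetic.
int main() {
    const unsigned LLAMA_MAX_NODES = 8192;
    const unsigned n_layer = 80; // hypothetical large model

    const unsigned before = LLAMA_MAX_NODES / (6*n_layer);               // 8192/480 = 17
    const unsigned after  = (LLAMA_MAX_NODES - 2*n_layer) / (6*n_layer); // 8032/480 = 16

    printf("moves per defrag graph: before=%u after=%u\n", before, after);
    return 0;
}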
@@ -11781,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ASSERT(false);
@@ -12003,7 +12182,79 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-
+
+        std::vector<std::string> word_collection;
+        switch (vocab.type) {
+            case LLAMA_VOCAB_TYPE_BPE:
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                            // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_MPT:
+                        // TODO: MPT pre-tokenization regexes are unknown
+                        //       the following are close, but not exact. run the following:
+                        //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                        GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                        word_collection = unicode_regex_split(text, {
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                        word_collection = unicode_regex_split(text, {
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    default:
+                        // default regex for BPE tokenization pre-processing
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                }
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
         symbols_final.clear();
 
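
The switch above replaces the hand-rolled bpe_gpt2_preprocess (removed below) with per-model regex cascades run through unicode_regex_split from the vendored unicode.cpp. A simplified stand-in using std::regex shows the cascade idea, where each pattern is applied only to the spans left unmatched by earlier patterns (std::regex cannot handle the \p{...} classes, which is why llama.cpp ships its own engine); regex_split_cascade is a hypothetical helper, not the library's API:

#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

static std::vector<std::string> regex_split_cascade(
        const std::string & text, const std::vector<std::string> & patterns) {
    // (span, already matched by an earlier pattern?)
    std::vector<std::pair<std::string, bool>> spans = { { text, false } };

    for (const auto & pat : patterns) {
        const std::regex re(pat);
        std::vector<std::pair<std::string, bool>> next;
        for (const auto & [span, matched] : spans) {
            if (matched) { next.emplace_back(span, true); continue; }
            auto begin = std::sregex_iterator(span.begin(), span.end(), re);
            auto end   = std::sregex_iterator();
            size_t last = 0;
            for (auto it = begin; it != end; ++it) {
                if ((size_t) it->position() > last) {
                    next.emplace_back(span.substr(last, it->position() - last), false);
                }
                next.emplace_back(it->str(), true);
                last = it->position() + it->length();
            }
            if (last < span.size()) {
                next.emplace_back(span.substr(last), false);
            }
        }
        spans = std::move(next);
    }

    std::vector<std::string> out;
    for (const auto & sp : spans) {
        out.push_back(sp.first);
    }
    return out;
}

int main() {
    // splits into: [It] ['s] [ ] [2024] [, folks!]
    for (const auto & w : regex_split_cascade("It's 2024, folks!", { "'s|'t|'re", "[0-9]+" })) {
        printf("[%s]\n", w.c_str());
    }
    return 0;
}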
@@ -12130,145 +12381,6 @@ private:
         work_queue.push(bigram);
     }
 
-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        std::string token = "";
-        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
-
-        std::vector<std::string> text_utf;
-        text_utf.reserve(text.size());
-        bpe_words.reserve(text.size());
-        bpe_encoded_words.reserve(text.size());
-
-        const auto cpts = unicode_cpts_from_utf8(text);
-        for (size_t i = 0; i < cpts.size(); ++i)
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
-
-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
-            // forward backward lookups
-            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_words.emplace_back(token);
-                    token = "";
-                    i++;
-                    continue;
-                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next + utf_char_next_next;
-                    bpe_words.emplace_back(token); // the contraction
-                    token = "";
-                    i += 2;
-                    continue;
-                }
-            }
-
-            if (!split_condition && !collecting) {
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    split_condition = true;
-                }
-            }
-
-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
-            }
-
-            if (split_condition) {
-                if (token.size()) {
-                    bpe_words.emplace_back(token);
-                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
-            }
-            else {
-                token += utf_char;
-            }
-        }
-
-        for (std::string & word : bpe_words) {
-            std::string encoded_token = "";
-            for (char & c : word) {
-                encoded_token += unicode_byte_to_utf8(c);
-            }
-            bpe_encoded_words.emplace_back(encoded_token);
-        }
-
-        return bpe_encoded_words;
-    }
-
     const llama_vocab & vocab;
 
     std::vector<llm_symbol> symbols;
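The hand-written scanner removed above duplicated what the regex-driven unicode_regex_split() now does; only its final byte-encoding step survives elsewhere in the tokenizer. For context, a sketch of that byte-level BPE mapping — a hypothetical stand-in for unicode_byte_to_utf8(), with the table constructed per the GPT-2 convention (shown on assumption):

    // sketch: GPT-2 byte-level BPE maps every raw byte to a printable code point,
    // so merges operate on visible characters instead of raw bytes
    #include <cstdint>
    #include <map>

    static std::map<uint8_t, uint32_t> byte_to_codepoint() {
        std::map<uint8_t, uint32_t> m;
        uint32_t n = 0;
        for (int b = 0; b < 256; ++b) {
            const bool keep = (b >= '!' && b <= '~') || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
            m[(uint8_t) b] = keep ? (uint32_t) b : 256 + n++;  // unprintable bytes are remapped above U+0100
        }
        return m;
    }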
@@ -12588,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
@@ -14360,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 }
 
 static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
-    std::mutex mutex;
-    int64_t counter = 0;
-    size_t new_size = 0;
     if (nthread < 2) {
         // single-thread
-        return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+        return new_size;
     }
-    auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
+
+    std::mutex mutex;
+    int64_t counter = 0;
+    size_t new_size = 0;
+    bool valid = true;
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
@@ -14382,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
             }
             lock.unlock();
             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            local_size += this_size;
+
+            // validate the quantized data
+            const size_t row_size = ggml_row_size(new_type, n_per_row);
+            void * this_data = (char *) new_data + first_row * row_size;
+            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                valid = false;
+                break;
+            }
         }
     };
     for (int it = 0; it < nthread - 1; ++it) {
@@ -14391,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     compute();
     for (auto & w : workers) { w.join(); }
     workers.clear();
+    if (!valid) {
+        throw std::runtime_error("quantized data validation failed");
+    }
     return new_size;
 }
 
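For clarity, the threading pattern used by llama_tensor_quantize_internal() reduces to a mutex-guarded counter from which each worker claims the next chunk of rows, plus a shared flag that stops the pool once validation fails. A minimal self-contained sketch (names hypothetical):

    #include <algorithm>
    #include <cstdint>
    #include <mutex>
    #include <thread>
    #include <vector>

    void for_each_chunk(int64_t nrows, int64_t nrows_per_chunk, int nthread) {
        std::mutex mutex;
        int64_t counter = 0;
        bool    valid   = true;
        auto compute = [&]() {
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                if (!valid || counter >= nrows) break;           // pool-wide abort or no work left
                const int64_t first_row = counter;
                counter += nrows_per_chunk;                      // claim the chunk
                lock.unlock();
                const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
                (void) this_nrow;                                // ... quantize and validate rows here ...
            }
        };
        std::vector<std::thread> workers;
        for (int it = 0; it < nthread - 1; ++it) workers.emplace_back(compute);
        compute();                                               // the calling thread participates too
        for (auto & w : workers) w.join();
    }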
@@ -14453,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
     llama_model model;
@@ -14491,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     for (auto & o : overrides) {
         if (o.key[0] == 0) break;
         if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-            gguf_set_val_f32(ctx_out, o.key, o.
+            gguf_set_val_f32(ctx_out, o.key, o.val_f64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-            gguf_set_val_i32(ctx_out, o.key, o.
+            gguf_set_val_i32(ctx_out, o.key, o.val_i64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-            gguf_set_val_bool(ctx_out, o.key, o.
+            gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+        } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+            gguf_set_val_str(ctx_out, o.key, o.val_str);
         } else {
             LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
         }
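For reference, a sketch of populating the string override handled by the new branch, assuming the fixed-size key/val_str character arrays declared in the vendored llama.h:

    #include <cstdio>
    #include "llama.h"

    // hypothetical helper: override tokenizer.ggml.pre when re-quantizing a model
    static llama_model_kv_override make_pre_override(const char * value) {
        llama_model_kv_override o = {};
        o.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        std::snprintf(o.key,     sizeof(o.key),     "%s", "tokenizer.ggml.pre");
        std::snprintf(o.val_str, sizeof(o.val_str), "%s", value);
        return o;
    }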
@@ -14814,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
     std::unique_ptr<llama_model_loader> ml;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
         ml->init_mappings(/*prefetch*/ false); // no prefetching
     }
 
@@ -15073,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only    =*/ false,
         /*.use_mmap      =*/ true,
         /*.use_mlock     =*/ false,
+        /*.check_tensors =*/ false,
     };
 
 #ifdef GGML_USE_METAL
@@ -15109,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all          =*/ false,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
+        /*.flash_attn          =*/ false,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -15275,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.defrag_thold = params.defrag_thold;
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
+    cparams.flash_attn   = params.flash_attn;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx        = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15282,12 +15418,20 @@
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
-    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
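GGML_PAD rounds its argument up to the next multiple of a power-of-two alignment, so the context size now lands on a 256 boundary. With the macro as defined in the vendored ggml.h (sample values illustrative):

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // e.g. a requested n_ctx of 1000 becomes 1024; 4096 stays 4096
    static_assert(GGML_PAD(1000, 256) == 1024, "rounded up");
    static_assert(GGML_PAD(4096, 256) == 4096, "already aligned");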
@@ -15319,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (cparams.flash_attn && hparams.use_alibi) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+#ifdef GGML_USE_HIPBLAS
+    if (cparams.flash_attn) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+#endif
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
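Client code opts in through the new context parameter; the checks above may still force it back off. A minimal sketch (model acquisition elided):

    #include "llama.h"

    static llama_context * context_with_flash_attn(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true;   // reported at context creation as "flash_attn = 1"
        return llama_new_context_with_model(model, cparams);
    }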
@@ -15326,6 +15487,7 @@
     LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch    = %u\n",   __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch   = %u\n",   __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn = %d\n",   __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
 
@@ -15454,7 +15616,7 @@
         }
         ctx->backends.push_back(ctx->backend_cpu);
 
-        if (!llama_kv_cache_init(ctx->kv_self, ctx
+        if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -16053,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     const size_t s_kv_head  = sizeof(uint32_t);
     const size_t s_kv_size  = sizeof(uint32_t);
     const size_t s_kv_used  = sizeof(uint32_t);
+    const size_t s_v_trans  = sizeof(uint32_t);
     const size_t s_kv       = ctx->kv_self.total_size();
     const size_t s_kv_cell  = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
     const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -16070,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
         + s_kv_head
         + s_kv_size
         + s_kv_used
+        + s_v_trans
         + s_kv
         + s_kv_cells
     );
 
+    // on session change it is very likely that the state size has changed - so we need to update this function
+    static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+
     return s_total;
 }
 
@@ -16219,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
     const uint32_t kv_size     = kv_self.size;
     const size_t   kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
     const uint32_t kv_used     = kv_self.used;
+    const uint32_t v_trans     = kv_self.v_trans ? 1 : 0;
 
     data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
     data_ctx->write(&kv_head,     sizeof(kv_head));
     data_ctx->write(&kv_size,     sizeof(kv_size));
     data_ctx->write(&kv_used,     sizeof(kv_used));
+    data_ctx->write(&v_trans,     sizeof(v_trans));
 
     if (kv_buf_size) {
         const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -16236,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
             ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
             data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
-            if (kv_self.recurrent) {
+            if (kv_self.recurrent || !kv_self.v_trans) {
                 // v is contiguous for recurrent models
                 // TODO: use other tensors for state models than k and v
                 const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16369,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
     uint32_t kv_head;
     uint32_t kv_size;
     uint32_t kv_used;
+    uint32_t v_trans;
 
     memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
     memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
     memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
     memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);
+    memcpy(&v_trans,     inp, sizeof(v_trans));     inp += sizeof(v_trans);
+
+    GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
 
     if (kv_self.size != kv_size) {
         // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16383,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
                 __func__, kv_head, kv_size, kv_self.size);
     }
 
+    llama_kv_cache_clear(ctx);
+
     if (kv_buf_size) {
         const size_t pre_kv_buf_size = inp - src;
 
@@ -16394,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
             inp += k_size;
 
-            if (kv_self.recurrent) {
+            if (kv_self.recurrent || !kv_self.v_trans) {
                 // v is contiguous for recurrent models
                 // TODO: use other tensors for state models than k and v
                 const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16416,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
         GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
     }
 
-    llama_kv_cache_clear(ctx);
-
     ctx->kv_self.head = kv_head;
     ctx->kv_self.used = kv_used;
 
@@ -16677,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
     }
 
-    // For the values, they are transposed, so we also need the element size and get the element ranges from each row
-    const uint32_t kv_size = kv_self.size;
-    for (int il = 0; il < (int)n_layer; ++il) {
-        // Write value type
-        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-        data_ctx.write(&v_type_i, sizeof(v_type_i));
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Write value type
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            data_ctx.write(&v_type_i, sizeof(v_type_i));
 
-        // Write element size
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        data_ctx.write(&v_size_el, sizeof(v_size_el));
+            // Write row size of value
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            data_ctx.write(&v_size_row, sizeof(v_size_row));
 
-        // For each row, we get the element values of each cell
-        for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-            // Read each range of cells of v_size_el length each into tmp_buf and write out
+            // Read each range of cells of v_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-                const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                tmp_buf.resize(range_size * v_size_el);
-                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                tmp_buf.resize(range_size * v_size_row);
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
                 data_ctx.write(tmp_buf.data(), tmp_buf.size());
             }
         }
+    } else {
+        // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+        const uint32_t kv_size = kv_self.size;
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Write value type
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+            // Write element size
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+            // For each row, we get the element values of each cell
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                    tmp_buf.resize(range_size * v_size_el);
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                    data_ctx.write(tmp_buf.data(), tmp_buf.size());
+                }
+            }
+        }
     }
 
     return data_ctx.get_size_written();
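The two branches differ only in addressing: with v_trans == false a cell's V row is contiguous, so a range of cells is a single read of range_size * v_size_row bytes, while the transposed layout stores element j of every cell adjacently, so each of the n_embd_v_gqa rows needs its own strided read. A worked example of the transposed offset (values illustrative):

    // kv_size = 1024 cells, f16 elements (2 bytes), copying cells [32, 40) of row j = 3:
    //   src_offset = (range.first + j * kv_size) * v_size_el
    //              = (32 + 3 * 1024) * 2 = 6208 bytes
    //   length     = range_size * v_size_el = 8 * 2 = 16 bytes, repeated once per row j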
@@ -16823,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
         }
     }
 
-    // For each layer, read the values for each cell (transposed)
-    for (int il = 0; il < (int)n_layer; ++il) {
-        // Read type of value
-        int32_t v_type_i_ref;
-        memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
-        inp += sizeof(v_type_i_ref);
-        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-        if (v_type_i != v_type_i_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
-            return 0;
-        }
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
 
-        // Read element size of value
-        size_t v_size_el_ref;
-        memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
-        inp += sizeof(v_size_el_ref);
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        if (v_size_el != v_size_el_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
-            return 0;
-        }
+            // Read row size of value
+            size_t v_size_row_ref;
+            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+            inp += sizeof(v_size_row_ref);
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+                return 0;
+            }
 
-        if (cell_count) {
-            // For each row in the transposed matrix, read the values for the whole cell range
-            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
-                ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
-                inp += cell_count * v_size_el;
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+                inp += cell_count * v_size_row;
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
+
+            // Read element size of value
+            size_t v_size_el_ref;
+            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+            inp += sizeof(v_size_el_ref);
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+                return 0;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                    inp += cell_count * v_size_el;
+                }
             }
         }
     }
 
     const size_t nread = inp - src;
+
     return nread;
 }
 
@@ -17654,9 +17882,9 @@ const char * llama_print_system_info(void) {
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
 #ifdef GGML_USE_LLAMAFILE
-    s += "
+    s += "LLAMAFILE = 1 | ";
 #else
-    s += "
+    s += "LLAMAFILE = 0 | ";
 #endif
 
     return s.c_str();