llama_cpp 0.14.7 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
@@ -75,6 +75,7 @@
|
|
75
75
|
#include <forward_list>
|
76
76
|
#include <fstream>
|
77
77
|
#include <functional>
|
78
|
+
#include <future>
|
78
79
|
#include <initializer_list>
|
79
80
|
#include <locale>
|
80
81
|
#include <map>
|
@@ -107,7 +108,6 @@
|
|
107
108
|
#define LLAMA_MAX_NODES 8192
|
108
109
|
#define LLAMA_MAX_EXPERTS 60
|
109
110
|
|
110
|
-
|
111
111
|
//
|
112
112
|
// logging
|
113
113
|
//
|
@@ -316,6 +316,7 @@ enum llm_kv {
|
|
316
316
|
LLM_KV_SSM_TIME_STEP_RANK,
|
317
317
|
|
318
318
|
LLM_KV_TOKENIZER_MODEL,
|
319
|
+
LLM_KV_TOKENIZER_PRE,
|
319
320
|
LLM_KV_TOKENIZER_LIST,
|
320
321
|
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
321
322
|
LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
|
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
392
393
|
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
393
394
|
|
394
395
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
396
|
+
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
395
397
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
396
398
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
397
399
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
@@ -1843,7 +1845,7 @@ struct llama_hparams {
|
|
1843
1845
|
float f_logit_scale = 0.0f;
|
1844
1846
|
|
1845
1847
|
bool causal_attn = true;
|
1846
|
-
bool
|
1848
|
+
bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
|
1847
1849
|
|
1848
1850
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
1849
1851
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
@@ -1933,6 +1935,7 @@ struct llama_cparams {
|
|
1933
1935
|
bool embeddings;
|
1934
1936
|
bool causal_attn;
|
1935
1937
|
bool offload_kqv;
|
1938
|
+
bool flash_attn;
|
1936
1939
|
|
1937
1940
|
enum llama_pooling_type pooling_type;
|
1938
1941
|
|
@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
|
|
2036
2039
|
bool has_shift = false;
|
2037
2040
|
bool do_defrag = false;
|
2038
2041
|
bool do_copy = false;
|
2039
|
-
// with recurrent state models, a cell can hold the state for more than one past token
|
2040
|
-
bool
|
2042
|
+
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
2043
|
+
bool v_trans = true; // the value tensor is transposed
|
2041
2044
|
|
2042
2045
|
// Note: The value of head isn't only used to optimize searching
|
2043
2046
|
// for a free KV slot. llama_decode_internal also uses it, so it
|
@@ -2114,7 +2117,8 @@ struct llama_vocab {
|
|
2114
2117
|
ttype type;
|
2115
2118
|
};
|
2116
2119
|
|
2117
|
-
enum llama_vocab_type
|
2120
|
+
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
2121
|
+
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
2118
2122
|
|
2119
2123
|
std::unordered_map<token, id> token_to_id;
|
2120
2124
|
std::vector<token_data> id_to_token;
|
@@ -2335,11 +2339,14 @@ struct llama_context {
|
|
2335
2339
|
|
2336
2340
|
static bool llama_kv_cache_init(
|
2337
2341
|
struct llama_kv_cache & cache,
|
2338
|
-
|
2342
|
+
const llama_context * ctx,
|
2339
2343
|
ggml_type type_k,
|
2340
2344
|
ggml_type type_v,
|
2341
2345
|
uint32_t kv_size,
|
2342
2346
|
bool offload) {
|
2347
|
+
const llama_model & model = ctx->model;
|
2348
|
+
const llama_cparams & cparams = ctx->cparams;
|
2349
|
+
|
2343
2350
|
const struct llama_hparams & hparams = model.hparams;
|
2344
2351
|
|
2345
2352
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(
|
|
2350
2357
|
|
2351
2358
|
// TODO: find a nicer way to add other recurrent model architectures
|
2352
2359
|
cache.recurrent = model.arch == LLM_ARCH_MAMBA;
|
2360
|
+
cache.v_trans = !cparams.flash_attn;
|
2353
2361
|
|
2354
|
-
// TODO: support mixed
|
2362
|
+
// TODO: support mixed recurrent Transformer architectures
|
2355
2363
|
// NOTE: (!a || b) is a logical implication (a -> b)
|
2356
2364
|
GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
|
2357
2365
|
GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
|
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
|
|
2562
2570
|
}
|
2563
2571
|
cache.head = 0;
|
2564
2572
|
cache.used = 0;
|
2573
|
+
|
2574
|
+
for (auto & buf : cache.bufs) {
|
2575
|
+
ggml_backend_buffer_clear(buf, 0);
|
2576
|
+
}
|
2565
2577
|
}
|
2566
2578
|
|
2567
2579
|
static bool llama_kv_cache_seq_rm(
|
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
|
|
2882
2894
|
case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
|
2883
2895
|
case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
|
2884
2896
|
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
|
2897
|
+
case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
|
2885
2898
|
}
|
2886
2899
|
return "unknown";
|
2887
2900
|
}
|
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
|
|
2893
2906
|
__func__, override_type_to_str(ovrd->tag), ovrd->key);
|
2894
2907
|
switch (ovrd->tag) {
|
2895
2908
|
case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
|
2896
|
-
LLAMA_LOG_INFO("%s\n", ovrd->
|
2909
|
+
LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
|
2897
2910
|
} break;
|
2898
2911
|
case LLAMA_KV_OVERRIDE_TYPE_INT: {
|
2899
|
-
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->
|
2912
|
+
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
|
2900
2913
|
} break;
|
2901
2914
|
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
|
2902
|
-
LLAMA_LOG_INFO("%.6f\n", ovrd->
|
2915
|
+
LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
|
2916
|
+
} break;
|
2917
|
+
case LLAMA_KV_OVERRIDE_TYPE_STR: {
|
2918
|
+
LLAMA_LOG_INFO("%s\n", ovrd->val_str);
|
2903
2919
|
} break;
|
2904
2920
|
default:
|
2905
2921
|
// Shouldn't be possible to end up here, but just in case...
|
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
|
|
2918
2934
|
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
|
2919
2935
|
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
2920
2936
|
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
|
2921
|
-
target = ovrd->
|
2937
|
+
target = ovrd->val_bool;
|
2922
2938
|
return true;
|
2923
2939
|
}
|
2924
2940
|
return false;
|
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
|
|
2928
2944
|
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
|
2929
2945
|
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
2930
2946
|
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
|
2931
|
-
target = ovrd->
|
2947
|
+
target = ovrd->val_i64;
|
2932
2948
|
return true;
|
2933
2949
|
}
|
2934
2950
|
return false;
|
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
|
|
2938
2954
|
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
|
2939
2955
|
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
2940
2956
|
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
|
2941
|
-
target = ovrd->
|
2957
|
+
target = ovrd->val_f64;
|
2942
2958
|
return true;
|
2943
2959
|
}
|
2944
2960
|
return false;
|
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
|
|
2947
2963
|
template<typename OT>
|
2948
2964
|
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
|
2949
2965
|
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
2950
|
-
(
|
2951
|
-
|
2952
|
-
|
2953
|
-
|
2954
|
-
|
2955
|
-
ovrd ? ovrd->key : "NULL"));
|
2966
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
|
2967
|
+
target = ovrd->val_str;
|
2968
|
+
return true;
|
2969
|
+
}
|
2970
|
+
return false;
|
2956
2971
|
}
|
2957
2972
|
|
2958
2973
|
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
|
|
2985
3000
|
size_t n_bytes = 0;
|
2986
3001
|
|
2987
3002
|
bool use_mmap = false;
|
3003
|
+
bool check_tensors;
|
2988
3004
|
|
2989
3005
|
llama_files files;
|
2990
3006
|
llama_ftype ftype;
|
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
|
|
3018
3034
|
std::string arch_name;
|
3019
3035
|
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
3020
3036
|
|
3021
|
-
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
|
3037
|
+
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
|
3022
3038
|
int trace = 0;
|
3023
3039
|
if (getenv("LLAMA_TRACE")) {
|
3024
3040
|
trace = atoi(getenv("LLAMA_TRACE"));
|
@@ -3115,9 +3131,17 @@ struct llama_model_loader {
|
|
3115
3131
|
|
3116
3132
|
fver = (enum llama_fver) gguf_get_version(meta);
|
3117
3133
|
|
3134
|
+
std::set<std::string> tensor_names;
|
3118
3135
|
for (auto & w : weights) {
|
3119
3136
|
n_elements += ggml_nelements(w.tensor);
|
3120
3137
|
n_bytes += ggml_nbytes(w.tensor);
|
3138
|
+
// make sure there are no duplicated tensor names
|
3139
|
+
const std::string name(w.tensor->name);
|
3140
|
+
auto found = tensor_names.find(name);
|
3141
|
+
if (found != tensor_names.end()) {
|
3142
|
+
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
|
3143
|
+
}
|
3144
|
+
tensor_names.insert(name);
|
3121
3145
|
}
|
3122
3146
|
|
3123
3147
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
@@ -3151,6 +3175,7 @@ struct llama_model_loader {
|
|
3151
3175
|
switch (type_max) {
|
3152
3176
|
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
3153
3177
|
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
3178
|
+
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
|
3154
3179
|
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
3155
3180
|
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
3156
3181
|
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
@@ -3223,6 +3248,7 @@ struct llama_model_loader {
|
|
3223
3248
|
}
|
3224
3249
|
|
3225
3250
|
this->use_mmap = use_mmap;
|
3251
|
+
this->check_tensors = check_tensors;
|
3226
3252
|
}
|
3227
3253
|
|
3228
3254
|
~llama_model_loader() {
|
@@ -3481,6 +3507,10 @@ struct llama_model_loader {
|
|
3481
3507
|
file->seek(w.offs, SEEK_SET);
|
3482
3508
|
file->read_raw(cur->data, ggml_nbytes(cur));
|
3483
3509
|
}
|
3510
|
+
|
3511
|
+
if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
|
3512
|
+
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
3513
|
+
}
|
3484
3514
|
}
|
3485
3515
|
|
3486
3516
|
size_t size_done = 0;
|
@@ -3497,6 +3527,8 @@ struct llama_model_loader {
|
|
3497
3527
|
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
3498
3528
|
|
3499
3529
|
std::vector<no_init<uint8_t>> read_buf;
|
3530
|
+
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
|
3531
|
+
|
3500
3532
|
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
3501
3533
|
const auto * weight = get_weight(ggml_get_name(cur));
|
3502
3534
|
if (weight == nullptr) {
|
@@ -3518,37 +3550,66 @@ struct llama_model_loader {
|
|
3518
3550
|
if (bufs_mmap.count(weight->idx)) {
|
3519
3551
|
buf_mmap = bufs_mmap.at(weight->idx);
|
3520
3552
|
}
|
3553
|
+
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
|
3554
|
+
|
3555
|
+
if (check_tensors) {
|
3556
|
+
validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
|
3557
|
+
return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
|
3558
|
+
}));
|
3559
|
+
}
|
3560
|
+
|
3521
3561
|
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
3522
3562
|
if (buf_mmap && cur->data == nullptr) {
|
3523
|
-
ggml_backend_tensor_alloc(buf_mmap, cur,
|
3563
|
+
ggml_backend_tensor_alloc(buf_mmap, cur, data);
|
3524
3564
|
if (lmlocks) {
|
3525
3565
|
const auto & lmlock = lmlocks->at(weight->idx);
|
3526
|
-
lmlock->grow_to(weight->offs +
|
3566
|
+
lmlock->grow_to(weight->offs + n_size);
|
3527
3567
|
}
|
3528
3568
|
|
3529
3569
|
auto & mmap_used = mmaps_used[weight->idx];
|
3530
3570
|
mmap_used.first = std::min(mmap_used.first, weight->offs);
|
3531
3571
|
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
|
3532
3572
|
} else {
|
3533
|
-
ggml_backend_tensor_set(cur,
|
3573
|
+
ggml_backend_tensor_set(cur, data, 0, n_size);
|
3534
3574
|
}
|
3535
3575
|
} else {
|
3536
3576
|
GGML_ASSERT(weight->idx < files.size());
|
3537
3577
|
const auto & file = files.at(weight->idx);
|
3538
3578
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
3539
3579
|
file->seek(weight->offs, SEEK_SET);
|
3540
|
-
file->read_raw(cur->data,
|
3580
|
+
file->read_raw(cur->data, n_size);
|
3581
|
+
if (check_tensors) {
|
3582
|
+
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
3583
|
+
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
3584
|
+
}));
|
3585
|
+
}
|
3541
3586
|
} else {
|
3542
|
-
read_buf.resize(
|
3587
|
+
read_buf.resize(n_size);
|
3543
3588
|
file->seek(weight->offs, SEEK_SET);
|
3544
|
-
file->read_raw(read_buf.data(),
|
3589
|
+
file->read_raw(read_buf.data(), n_size);
|
3545
3590
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
3591
|
+
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
3592
|
+
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
3593
|
+
}
|
3546
3594
|
}
|
3547
3595
|
}
|
3548
3596
|
|
3549
3597
|
size_done += n_size;
|
3550
3598
|
}
|
3551
3599
|
|
3600
|
+
// check validation results
|
3601
|
+
bool validation_failed = false;
|
3602
|
+
for (auto & future : validation_result) {
|
3603
|
+
auto result = future.get();
|
3604
|
+
if (!result.second) {
|
3605
|
+
LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
|
3606
|
+
validation_failed = true;
|
3607
|
+
}
|
3608
|
+
}
|
3609
|
+
if (validation_failed) {
|
3610
|
+
throw std::runtime_error("found tensors with invalid data");
|
3611
|
+
}
|
3612
|
+
|
3552
3613
|
// check if this is the last call and do final cleanup
|
3553
3614
|
if (size_done >= size_data) {
|
3554
3615
|
// unmap offloaded tensors and metadata
|
@@ -3606,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
3606
3667
|
switch (ftype) {
|
3607
3668
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
3608
3669
|
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
3670
|
+
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
3609
3671
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
3610
3672
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
3611
3673
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
@@ -4142,7 +4204,7 @@ static void llm_load_hparams(
|
|
4142
4204
|
model.ftype = ml.ftype;
|
4143
4205
|
|
4144
4206
|
if (hparams.f_max_alibi_bias > 0.0f) {
|
4145
|
-
hparams.
|
4207
|
+
hparams.use_alibi = true;
|
4146
4208
|
}
|
4147
4209
|
|
4148
4210
|
hparams.rope_type = llama_rope_type(&model);
|
@@ -4165,11 +4227,13 @@ static void llm_load_vocab(
|
|
4165
4227
|
|
4166
4228
|
// determine vocab type
|
4167
4229
|
{
|
4168
|
-
std::string
|
4230
|
+
std::string tokenizer_model;
|
4231
|
+
std::string tokenizer_pre;
|
4169
4232
|
|
4170
|
-
ml.get_key(LLM_KV_TOKENIZER_MODEL,
|
4233
|
+
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
4234
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
4171
4235
|
|
4172
|
-
if (
|
4236
|
+
if (tokenizer_model == "no_vocab") {
|
4173
4237
|
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
4174
4238
|
|
4175
4239
|
// default special tokens
|
@@ -4183,7 +4247,7 @@ static void llm_load_vocab(
|
|
4183
4247
|
vocab.linefeed_id = -1;
|
4184
4248
|
|
4185
4249
|
return;
|
4186
|
-
} else if (
|
4250
|
+
} else if (tokenizer_model == "llama") {
|
4187
4251
|
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
4188
4252
|
|
4189
4253
|
// default special tokens
|
@@ -4228,9 +4292,27 @@ static void llm_load_vocab(
|
|
4228
4292
|
if (add_space_prefix_keyidx != -1) {
|
4229
4293
|
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
4230
4294
|
} // The default value of add_space_prefix is true.
|
4231
|
-
} else if (
|
4232
|
-
vocab.type =
|
4295
|
+
} else if (tokenizer_model == "bert") {
|
4296
|
+
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
4233
4297
|
|
4298
|
+
// default special tokens
|
4299
|
+
vocab.special_bos_id = -1;
|
4300
|
+
vocab.special_eos_id = -1;
|
4301
|
+
vocab.special_unk_id = 100;
|
4302
|
+
vocab.special_sep_id = 102;
|
4303
|
+
vocab.special_pad_id = 0;
|
4304
|
+
vocab.special_cls_id = 101;
|
4305
|
+
vocab.special_mask_id = 103;
|
4306
|
+
vocab.add_space_prefix = false;
|
4307
|
+
} else {
|
4308
|
+
if (tokenizer_model == "gpt2") {
|
4309
|
+
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
4310
|
+
} else {
|
4311
|
+
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
|
4312
|
+
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
4313
|
+
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
4314
|
+
return;
|
4315
|
+
}
|
4234
4316
|
// read bpe merges and populate bpe ranks
|
4235
4317
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
4236
4318
|
if (merges_keyidx == -1) {
|
@@ -4264,23 +4346,65 @@ static void llm_load_vocab(
|
|
4264
4346
|
vocab.special_pad_id = -1;
|
4265
4347
|
vocab.special_cls_id = -1;
|
4266
4348
|
vocab.special_mask_id = -1;
|
4267
|
-
}
|
4268
|
-
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
4349
|
+
}
|
4269
4350
|
|
4270
|
-
|
4271
|
-
|
4272
|
-
|
4273
|
-
|
4274
|
-
|
4275
|
-
|
4276
|
-
|
4277
|
-
|
4278
|
-
|
4351
|
+
// for now, only BPE models have pre-tokenizers
|
4352
|
+
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
4353
|
+
if (tokenizer_pre.empty()) {
|
4354
|
+
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
4355
|
+
LLAMA_LOG_WARN("%s: \n", __func__);
|
4356
|
+
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
4357
|
+
LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
|
4358
|
+
LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
|
4359
|
+
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
4360
|
+
LLAMA_LOG_WARN("%s: \n", __func__);
|
4361
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
4362
|
+
} else if (
|
4363
|
+
tokenizer_pre == "default") {
|
4364
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
4365
|
+
} else if (
|
4366
|
+
tokenizer_pre == "llama3" ||
|
4367
|
+
tokenizer_pre == "llama-v3" ||
|
4368
|
+
tokenizer_pre == "llama-bpe") {
|
4369
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
4370
|
+
} else if (
|
4371
|
+
tokenizer_pre == "deepseek-llm") {
|
4372
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
|
4373
|
+
} else if (
|
4374
|
+
tokenizer_pre == "deepseek-coder") {
|
4375
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
4376
|
+
} else if (
|
4377
|
+
tokenizer_pre == "falcon") {
|
4378
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
4379
|
+
} else if (
|
4380
|
+
tokenizer_pre == "mpt") {
|
4381
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
|
4382
|
+
} else if (
|
4383
|
+
tokenizer_pre == "starcoder") {
|
4384
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
|
4385
|
+
} else if (
|
4386
|
+
tokenizer_pre == "gpt-2") {
|
4387
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
4388
|
+
} else if (
|
4389
|
+
tokenizer_pre == "refact") {
|
4390
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
4391
|
+
} else if (
|
4392
|
+
tokenizer_pre == "command-r") {
|
4393
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
4394
|
+
} else if (
|
4395
|
+
tokenizer_pre == "qwen2") {
|
4396
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
4397
|
+
} else if (
|
4398
|
+
tokenizer_pre == "olmo") {
|
4399
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
4400
|
+
} else if (
|
4401
|
+
tokenizer_pre == "dbrx") {
|
4402
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
4403
|
+
} else {
|
4404
|
+
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
4405
|
+
}
|
4279
4406
|
} else {
|
4280
|
-
|
4281
|
-
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
4282
|
-
|
4283
|
-
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
4407
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
4284
4408
|
}
|
4285
4409
|
}
|
4286
4410
|
|
@@ -5975,7 +6099,7 @@ static bool llm_load_tensors(
|
|
5975
6099
|
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
5976
6100
|
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
5977
6101
|
try {
|
5978
|
-
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
6102
|
+
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
|
5979
6103
|
|
5980
6104
|
model.hparams.vocab_only = params.vocab_only;
|
5981
6105
|
|
@@ -6013,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
6013
6137
|
|| !(
|
6014
6138
|
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
6015
6139
|
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
6140
|
+
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
|
6016
6141
|
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
6017
6142
|
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
6018
6143
|
)
|
@@ -6104,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
6104
6229
|
static void llm_build_kv_store(
|
6105
6230
|
struct ggml_context * ctx,
|
6106
6231
|
const llama_hparams & hparams,
|
6232
|
+
const llama_cparams & cparams,
|
6107
6233
|
const llama_kv_cache & kv,
|
6108
6234
|
struct ggml_cgraph * graph,
|
6109
6235
|
struct ggml_tensor * k_cur,
|
6110
6236
|
struct ggml_tensor * v_cur,
|
6111
|
-
int64_t n_ctx,
|
6112
6237
|
int32_t n_tokens,
|
6113
6238
|
int32_t kv_head,
|
6114
6239
|
const llm_build_cb & cb,
|
6115
6240
|
int64_t il) {
|
6241
|
+
const int64_t n_ctx = cparams.n_ctx;
|
6242
|
+
|
6116
6243
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
6117
6244
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
6118
6245
|
|
6119
6246
|
GGML_ASSERT(kv.size == n_ctx);
|
6120
6247
|
|
6121
|
-
// compute the transposed [n_tokens, n_embd] V matrix
|
6122
|
-
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
6123
|
-
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
6124
|
-
cb(v_cur_t, "v_cur_t", il);
|
6125
|
-
|
6126
6248
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
6127
6249
|
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
|
6128
6250
|
cb(k_cache_view, "k_cache_view", il);
|
6129
6251
|
|
6130
|
-
|
6131
|
-
|
6132
|
-
|
6252
|
+
// note: storing RoPE-ed version of K in the KV cache
|
6253
|
+
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
6254
|
+
|
6255
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
6256
|
+
|
6257
|
+
struct ggml_tensor * v_cache_view = nullptr;
|
6258
|
+
|
6259
|
+
if (cparams.flash_attn) {
|
6260
|
+
v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
|
6261
|
+
(kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
|
6262
|
+
} else {
|
6263
|
+
// note: the V cache is transposed when not using flash attention
|
6264
|
+
v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
|
6265
|
+
( n_ctx)*ggml_element_size(kv.v_l[il]),
|
6266
|
+
(kv_head)*ggml_element_size(kv.v_l[il]));
|
6267
|
+
|
6268
|
+
v_cur = ggml_transpose(ctx, v_cur);
|
6269
|
+
}
|
6133
6270
|
cb(v_cache_view, "v_cache_view", il);
|
6134
6271
|
|
6135
|
-
|
6136
|
-
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
6137
|
-
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
|
6272
|
+
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
|
6138
6273
|
}
|
6139
6274
|
|
6140
6275
|
static struct ggml_tensor * llm_build_norm(
|
@@ -6354,11 +6489,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6354
6489
|
return moe_out;
|
6355
6490
|
}
|
6356
6491
|
|
6357
|
-
// if max_alibi_bias > 0 then apply ALiBi
|
6358
6492
|
static struct ggml_tensor * llm_build_kqv(
|
6359
6493
|
struct ggml_context * ctx,
|
6360
6494
|
const llama_model & model,
|
6361
6495
|
const llama_hparams & hparams,
|
6496
|
+
const llama_cparams & cparams,
|
6362
6497
|
const llama_kv_cache & kv,
|
6363
6498
|
struct ggml_cgraph * graph,
|
6364
6499
|
struct ggml_tensor * wo,
|
@@ -6366,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6366
6501
|
struct ggml_tensor * q_cur,
|
6367
6502
|
struct ggml_tensor * kq_mask,
|
6368
6503
|
struct ggml_tensor * kq_pos,
|
6369
|
-
int64_t n_ctx,
|
6370
6504
|
int32_t n_tokens,
|
6371
6505
|
int32_t n_kv,
|
6372
6506
|
float kq_scale,
|
6373
6507
|
const llm_build_cb & cb,
|
6374
6508
|
int il) {
|
6509
|
+
const int64_t n_ctx = cparams.n_ctx;
|
6375
6510
|
const int64_t n_head = hparams.n_head;
|
6376
6511
|
const int64_t n_head_kv = hparams.n_head_kv;
|
6377
6512
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
@@ -6389,71 +6524,99 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6389
6524
|
0);
|
6390
6525
|
cb(k, "k", il);
|
6391
6526
|
|
6392
|
-
struct ggml_tensor *
|
6393
|
-
|
6527
|
+
struct ggml_tensor * cur;
|
6528
|
+
|
6529
|
+
if (cparams.flash_attn) {
|
6530
|
+
GGML_UNUSED(model);
|
6531
|
+
GGML_UNUSED(n_ctx);
|
6394
6532
|
|
6395
|
-
|
6396
|
-
//
|
6397
|
-
|
6398
|
-
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6399
|
-
}
|
6533
|
+
// note: if this assert triggers, then some check has failed earlier
|
6534
|
+
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
|
6535
|
+
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
|
6400
6536
|
|
6401
|
-
|
6402
|
-
|
6403
|
-
|
6404
|
-
|
6405
|
-
|
6406
|
-
|
6537
|
+
// split cached v into n_head heads (not transposed)
|
6538
|
+
struct ggml_tensor * v =
|
6539
|
+
ggml_view_3d(ctx, kv.v_l[il],
|
6540
|
+
n_embd_head_v, n_kv, n_head_kv,
|
6541
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
6542
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
6543
|
+
0);
|
6544
|
+
cb(v, "v", il);
|
6407
6545
|
|
6408
|
-
|
6409
|
-
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6546
|
+
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
|
6410
6547
|
|
6411
|
-
|
6412
|
-
|
6413
|
-
|
6548
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6549
|
+
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
6550
|
+
}
|
6551
|
+
|
6552
|
+
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
6553
|
+
} else {
|
6554
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
6555
|
+
cb(kq, "kq", il);
|
6556
|
+
|
6557
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6558
|
+
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
6559
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
6560
|
+
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6561
|
+
}
|
6562
|
+
|
6563
|
+
if (model.arch == LLM_ARCH_GROK) {
|
6564
|
+
// need to do the following:
|
6565
|
+
// multiply by attn_output_multiplier of 0.08838834764831845
|
6566
|
+
// and then :
|
6567
|
+
// kq = 30 * tanh(kq / 30)
|
6568
|
+
// before the softmax below
|
6569
|
+
|
6570
|
+
//try from phi2
|
6571
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6572
|
+
|
6573
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
6574
|
+
kq = ggml_scale(ctx, kq, 30);
|
6575
|
+
}
|
6414
6576
|
|
6415
6577
|
#if defined(GGML_USE_KOMPUTE)
|
6416
6578
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
6417
6579
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
6418
6580
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
6419
|
-
|
6420
|
-
|
6421
|
-
|
6581
|
+
if (hparams.use_alibi) {
|
6582
|
+
kq = ggml_scale(ctx, kq, kq_scale);
|
6583
|
+
cb(kq, "kq_scaled", il);
|
6422
6584
|
|
6423
|
-
|
6424
|
-
|
6585
|
+
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
6586
|
+
cb(kq, "kq_scaled_alibi", il);
|
6425
6587
|
|
6426
|
-
|
6427
|
-
|
6588
|
+
kq = ggml_add(ctx, kq, kq_mask);
|
6589
|
+
cb(kq, "kq_masked", il);
|
6428
6590
|
|
6429
|
-
|
6430
|
-
|
6431
|
-
|
6591
|
+
kq = ggml_soft_max(ctx, kq);
|
6592
|
+
cb(kq, "kq_soft_max", il);
|
6593
|
+
} else
|
6432
6594
|
#endif
|
6433
|
-
|
6434
|
-
|
6435
|
-
|
6436
|
-
|
6595
|
+
{
|
6596
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
6597
|
+
cb(kq, "kq_soft_max_ext", il);
|
6598
|
+
}
|
6437
6599
|
|
6438
|
-
|
6600
|
+
GGML_ASSERT(kv.size == n_ctx);
|
6439
6601
|
|
6440
|
-
|
6441
|
-
|
6442
|
-
|
6443
|
-
|
6444
|
-
|
6445
|
-
|
6446
|
-
|
6447
|
-
|
6602
|
+
// split cached v into n_head heads
|
6603
|
+
struct ggml_tensor * v =
|
6604
|
+
ggml_view_3d(ctx, kv.v_l[il],
|
6605
|
+
n_kv, n_embd_head_v, n_head_kv,
|
6606
|
+
ggml_element_size(kv.v_l[il])*n_ctx,
|
6607
|
+
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
|
6608
|
+
0);
|
6609
|
+
cb(v, "v", il);
|
6448
6610
|
|
6449
|
-
|
6450
|
-
|
6611
|
+
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
6612
|
+
cb(kqv, "kqv", il);
|
6451
6613
|
|
6452
|
-
|
6453
|
-
|
6614
|
+
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
6615
|
+
cb(kqv_merged, "kqv_merged", il);
|
6454
6616
|
|
6455
|
-
|
6456
|
-
|
6617
|
+
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
6618
|
+
cb(cur, "kqv_merged_cont", il);
|
6619
|
+
}
|
6457
6620
|
|
6458
6621
|
ggml_build_forward_expand(graph, cur);
|
6459
6622
|
|
@@ -6473,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
6473
6636
|
struct ggml_context * ctx,
|
6474
6637
|
const llama_model & model,
|
6475
6638
|
const llama_hparams & hparams,
|
6639
|
+
const llama_cparams & cparams,
|
6476
6640
|
const llama_kv_cache & kv,
|
6477
6641
|
struct ggml_cgraph * graph,
|
6478
6642
|
struct ggml_tensor * wo,
|
@@ -6482,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv(
|
|
6482
6646
|
struct ggml_tensor * q_cur,
|
6483
6647
|
struct ggml_tensor * kq_mask,
|
6484
6648
|
struct ggml_tensor * kq_pos,
|
6485
|
-
int64_t n_ctx,
|
6486
6649
|
int32_t n_tokens,
|
6487
6650
|
int32_t kv_head,
|
6488
6651
|
int32_t n_kv,
|
@@ -6496,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv(
|
|
6496
6659
|
ggml_build_forward_expand(graph, k_cur);
|
6497
6660
|
ggml_build_forward_expand(graph, v_cur);
|
6498
6661
|
|
6499
|
-
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
|
6662
|
+
llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
|
6500
6663
|
|
6501
6664
|
struct ggml_tensor * cur;
|
6502
6665
|
|
6503
|
-
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
6504
|
-
q_cur, kq_mask, kq_pos,
|
6666
|
+
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
|
6667
|
+
q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
|
6505
6668
|
cb(cur, "kqv_out", il);
|
6506
6669
|
|
6507
6670
|
return cur;
|
@@ -6543,6 +6706,8 @@ struct llm_build_context {
|
|
6543
6706
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
6544
6707
|
const int32_t n_orig_ctx;
|
6545
6708
|
|
6709
|
+
const bool flash_attn;
|
6710
|
+
|
6546
6711
|
const enum llama_pooling_type pooling_type;
|
6547
6712
|
const enum llama_rope_type rope_type;
|
6548
6713
|
|
@@ -6589,6 +6754,7 @@ struct llm_build_context {
|
|
6589
6754
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
6590
6755
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
6591
6756
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
6757
|
+
flash_attn (cparams.flash_attn),
|
6592
6758
|
pooling_type (cparams.pooling_type),
|
6593
6759
|
rope_type (hparams.rope_type),
|
6594
6760
|
cb (cb),
|
@@ -6703,15 +6869,31 @@ struct llm_build_context {
|
|
6703
6869
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
6704
6870
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
6705
6871
|
|
6706
|
-
ggml_tensor * view_v_src
|
6707
|
-
|
6708
|
-
|
6709
|
-
|
6872
|
+
ggml_tensor * view_v_src;
|
6873
|
+
ggml_tensor * view_v_dst;
|
6874
|
+
|
6875
|
+
if (flash_attn) {
|
6876
|
+
// NOTE: the V cache is not transposed when using flash attention
|
6877
|
+
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6878
|
+
n_embd_v_gqa, nm,
|
6879
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
6880
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
|
6710
6881
|
|
6711
|
-
|
6712
|
-
|
6713
|
-
|
6714
|
-
|
6882
|
+
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6883
|
+
n_embd_v_gqa, nm,
|
6884
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
6885
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
|
6886
|
+
} else {
|
6887
|
+
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6888
|
+
nm, n_embd_v_gqa,
|
6889
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
6890
|
+
ggml_row_size(kv_self.v_l[il]->type, i));
|
6891
|
+
|
6892
|
+
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6893
|
+
nm, n_embd_v_gqa,
|
6894
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
6895
|
+
ggml_row_size(kv_self.v_l[il]->type, id));
|
6896
|
+
}
|
6715
6897
|
|
6716
6898
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
6717
6899
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
@@ -6741,20 +6923,26 @@ struct llm_build_context {
|
|
6741
6923
|
|
6742
6924
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
6743
6925
|
if (causal) {
|
6744
|
-
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
|
6926
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
6745
6927
|
} else {
|
6746
|
-
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
6928
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
6747
6929
|
}
|
6748
6930
|
cb(lctx.inp_KQ_mask, "KQ_mask", -1);
|
6749
6931
|
ggml_set_input(lctx.inp_KQ_mask);
|
6750
|
-
return lctx.inp_KQ_mask;
|
6932
|
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
|
6751
6933
|
}
|
6752
6934
|
|
6753
|
-
struct ggml_tensor * build_inp_KQ_pos() {
|
6754
|
-
|
6935
|
+
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
|
6936
|
+
if (causal) {
|
6937
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
|
6938
|
+
} else {
|
6939
|
+
// TODO: this will be needed for ALiBi-based BERT models
|
6940
|
+
// https://github.com/ggerganov/llama.cpp/pull/6826
|
6941
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
|
6942
|
+
}
|
6755
6943
|
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
|
6756
6944
|
ggml_set_input(lctx.inp_KQ_pos);
|
6757
|
-
return lctx.inp_KQ_pos;
|
6945
|
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
|
6758
6946
|
}
|
6759
6947
|
|
6760
6948
|
struct ggml_tensor * build_inp_mean() {
|
@@ -6860,9 +7048,9 @@ struct llm_build_context {
|
|
6860
7048
|
);
|
6861
7049
|
cb(Kcur, "Kcur", il);
|
6862
7050
|
|
6863
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7051
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
6864
7052
|
model.layers[il].wo, model.layers[il].bo,
|
6865
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7053
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6866
7054
|
}
|
6867
7055
|
|
6868
7056
|
if (il == n_layer - 1) {
|
@@ -7000,9 +7188,9 @@ struct llm_build_context {
|
|
7000
7188
|
cb(Qcur, "Qcur", il);
|
7001
7189
|
cb(Kcur, "Kcur", il);
|
7002
7190
|
|
7003
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7191
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7004
7192
|
model.layers[il].wo, NULL,
|
7005
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
7193
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7006
7194
|
}
|
7007
7195
|
|
7008
7196
|
if (il == n_layer - 1) {
|
@@ -7107,9 +7295,9 @@ struct llm_build_context {
|
|
7107
7295
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7108
7296
|
);
|
7109
7297
|
cb(Kcur, "Kcur", il);
|
7110
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7298
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7111
7299
|
model.layers[il].wo, NULL,
|
7112
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
7300
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7113
7301
|
}
|
7114
7302
|
|
7115
7303
|
if (il == n_layer - 1) {
|
@@ -7227,9 +7415,9 @@ struct llm_build_context {
|
|
7227
7415
|
);
|
7228
7416
|
cb(Kcur, "Kcur", il);
|
7229
7417
|
|
7230
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7418
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7231
7419
|
model.layers[il].wo, NULL,
|
7232
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7420
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7233
7421
|
}
|
7234
7422
|
|
7235
7423
|
if (il == n_layer - 1) {
|
@@ -7352,9 +7540,9 @@ struct llm_build_context {
|
|
7352
7540
|
);
|
7353
7541
|
cb(Kcur, "Kcur", il);
|
7354
7542
|
|
7355
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7543
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7356
7544
|
model.layers[il].wo, model.layers[il].bo,
|
7357
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7545
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7358
7546
|
}
|
7359
7547
|
|
7360
7548
|
if (il == n_layer - 1) {
|
@@ -7504,9 +7692,9 @@ struct llm_build_context {
|
|
7504
7692
|
);
|
7505
7693
|
cb(Kcur, "Kcur", il);
|
7506
7694
|
|
7507
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7508
|
-
|
7509
|
-
|
7695
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7696
|
+
model.layers[il].wo, NULL,
|
7697
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7510
7698
|
}
|
7511
7699
|
|
7512
7700
|
if (il == n_layer - 1) {
|
@@ -7616,9 +7804,9 @@ struct llm_build_context {
|
|
7616
7804
|
|
7617
7805
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7618
7806
|
|
7619
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7807
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7620
7808
|
model.layers[il].wo, model.layers[il].bo,
|
7621
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7809
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7622
7810
|
}
|
7623
7811
|
|
7624
7812
|
if (il == n_layer - 1) {
|
@@ -7820,9 +8008,9 @@ struct llm_build_context {
|
|
7820
8008
|
);
|
7821
8009
|
cb(Vcur, "Vcur", il);
|
7822
8010
|
|
7823
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8011
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7824
8012
|
model.layers[il].wo, model.layers[il].bo,
|
7825
|
-
Kcur, Vcur, Q, KQ_mask, nullptr,
|
8013
|
+
Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7826
8014
|
}
|
7827
8015
|
|
7828
8016
|
if (il == n_layer - 1) {
|
@@ -7916,9 +8104,9 @@ struct llm_build_context {
|
|
7916
8104
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7917
8105
|
cb(Qcur, "Qcur", il);
|
7918
8106
|
|
7919
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8107
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7920
8108
|
model.layers[il].wo, NULL,
|
7921
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8109
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7922
8110
|
}
|
7923
8111
|
|
7924
8112
|
if (il == n_layer - 1) {
|
@@ -8209,9 +8397,9 @@ struct llm_build_context {
|
|
8209
8397
|
|
8210
8398
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8211
8399
|
|
8212
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8400
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8213
8401
|
model.layers[il].wo, model.layers[il].bo,
|
8214
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8402
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8215
8403
|
}
|
8216
8404
|
|
8217
8405
|
if (il == n_layer - 1) {
|
@@ -8340,14 +8528,15 @@ struct llm_build_context {
|
|
8340
8528
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8341
8529
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8342
8530
|
|
8343
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8344
|
-
|
8345
|
-
|
8531
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8532
|
+
model.layers[il].wo, model.layers[il].bo,
|
8533
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8346
8534
|
} else {
|
8347
8535
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8348
|
-
|
8536
|
+
|
8537
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8349
8538
|
model.layers[il].wo, model.layers[il].bo,
|
8350
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8539
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8351
8540
|
}
|
8352
8541
|
}
|
8353
8542
|
|
@@ -8489,9 +8678,9 @@ struct llm_build_context {
|
|
8489
8678
|
);
|
8490
8679
|
cb(Kcur, "Kcur", il);
|
8491
8680
|
|
8492
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8681
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8493
8682
|
model.layers[il].wo, NULL,
|
8494
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8683
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8495
8684
|
}
|
8496
8685
|
|
8497
8686
|
if (il == n_layer - 1) {
|
@@ -8607,9 +8796,9 @@ struct llm_build_context {
|
|
8607
8796
|
);
|
8608
8797
|
cb(Kcur, "Kcur", il);
|
8609
8798
|
|
8610
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8799
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8611
8800
|
model.layers[il].wo, NULL,
|
8612
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8801
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8613
8802
|
}
|
8614
8803
|
|
8615
8804
|
if (il == n_layer - 1) {
|
@@ -8720,9 +8909,9 @@ struct llm_build_context {
|
|
8720
8909
|
);
|
8721
8910
|
cb(Kcur, "Kcur", il);
|
8722
8911
|
|
8723
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8912
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8724
8913
|
model.layers[il].wo, model.layers[il].bo,
|
8725
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8914
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8726
8915
|
}
|
8727
8916
|
|
8728
8917
|
if (il == n_layer - 1) {
|
@@ -8834,9 +9023,9 @@ struct llm_build_context {
|
|
8834
9023
|
);
|
8835
9024
|
cb(Kcur, "Kcur", il);
|
8836
9025
|
|
8837
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9026
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8838
9027
|
model.layers[il].wo, model.layers[il].bo,
|
8839
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9028
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8840
9029
|
}
|
8841
9030
|
|
8842
9031
|
if (il == n_layer - 1) {
|
@@ -8989,9 +9178,9 @@ struct llm_build_context {
|
|
8989
9178
|
);
|
8990
9179
|
cb(Kcur, "Kcur", il);
|
8991
9180
|
|
8992
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9181
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8993
9182
|
model.layers[il].wo, model.layers[il].bo,
|
8994
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9183
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8995
9184
|
}
|
8996
9185
|
|
8997
9186
|
if (il == n_layer - 1) {
|
@@ -9106,9 +9295,9 @@ struct llm_build_context {
|
|
9106
9295
|
);
|
9107
9296
|
cb(Kcur, "Kcur", il);
|
9108
9297
|
|
9109
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9110
|
-
|
9111
|
-
|
9298
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9299
|
+
model.layers[il].wo, model.layers[il].bo,
|
9300
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9112
9301
|
}
|
9113
9302
|
|
9114
9303
|
if (il == n_layer - 1) {
|
@@ -9219,9 +9408,9 @@ struct llm_build_context {
|
|
9219
9408
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9220
9409
|
cb(Kcur, "Kcur", il);
|
9221
9410
|
|
9222
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9411
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9223
9412
|
model.layers[il].wo, NULL,
|
9224
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9413
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9225
9414
|
}
|
9226
9415
|
struct ggml_tensor * sa_out = cur;
|
9227
9416
|
|
@@ -9322,9 +9511,9 @@ struct llm_build_context {
|
|
9322
9511
|
|
9323
9512
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9324
9513
|
|
9325
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9514
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9326
9515
|
model.layers[il].wo, model.layers[il].bo,
|
9327
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9516
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9328
9517
|
}
|
9329
9518
|
|
9330
9519
|
if (il == n_layer - 1) {
|
@@ -9429,9 +9618,9 @@ struct llm_build_context {
|
|
9429
9618
|
);
|
9430
9619
|
cb(Kcur, "Kcur", il);
|
9431
9620
|
|
9432
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9621
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9433
9622
|
model.layers[il].wo, model.layers[il].bo,
|
9434
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9623
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9435
9624
|
}
|
9436
9625
|
|
9437
9626
|
if (il == n_layer - 1) {
|
@@ -9545,9 +9734,9 @@ struct llm_build_context {
|
|
9545
9734
|
);
|
9546
9735
|
cb(Kcur, "Kcur", il);
|
9547
9736
|
|
9548
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9737
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9549
9738
|
model.layers[il].wo, NULL,
|
9550
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9739
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9551
9740
|
}
|
9552
9741
|
|
9553
9742
|
if (il == n_layer - 1) {
|
@@ -9662,9 +9851,9 @@ struct llm_build_context {
|
|
9662
9851
|
);
|
9663
9852
|
cb(Kcur, "Kcur", il);
|
9664
9853
|
|
9665
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9854
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9666
9855
|
model.layers[il].wo, model.layers[il].bo,
|
9667
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9856
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9668
9857
|
}
|
9669
9858
|
|
9670
9859
|
if (il == n_layer - 1) {
|
@@ -9792,9 +9981,9 @@ struct llm_build_context {
|
|
9792
9981
|
);
|
9793
9982
|
cb(Kcur, "Kcur", il);
|
9794
9983
|
|
9795
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9984
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9796
9985
|
model.layers[il].wo, model.layers[il].bo,
|
9797
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9986
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9798
9987
|
}
|
9799
9988
|
|
9800
9989
|
if (il == n_layer - 1) {
|
@@ -9913,9 +10102,9 @@ struct llm_build_context {
|
|
9913
10102
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9914
10103
|
cb(Kcur, "Kcur", il);
|
9915
10104
|
|
9916
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10105
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9917
10106
|
model.layers[il].wo, NULL,
|
9918
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10107
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9919
10108
|
}
|
9920
10109
|
|
9921
10110
|
if (il == n_layer - 1) {
|
@@ -10032,9 +10221,9 @@ struct llm_build_context {
|
|
10032
10221
|
);
|
10033
10222
|
cb(Kcur, "Kcur", il);
|
10034
10223
|
|
10035
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10224
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10036
10225
|
model.layers[il].wo, model.layers[il].bo,
|
10037
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10226
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10038
10227
|
}
|
10039
10228
|
|
10040
10229
|
if (il == n_layer - 1) {
|
@@ -10322,9 +10511,9 @@ struct llm_build_context {
|
|
10322
10511
|
);
|
10323
10512
|
cb(Kcur, "Kcur", il);
|
10324
10513
|
|
10325
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10514
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10326
10515
|
model.layers[il].wo, model.layers[il].bo,
|
10327
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10516
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10328
10517
|
}
|
10329
10518
|
|
10330
10519
|
if (il == n_layer - 1) {
|
@@ -10453,9 +10642,9 @@ struct llm_build_context {
|
|
10453
10642
|
);
|
10454
10643
|
cb(Kcur, "Kcur", il);
|
10455
10644
|
|
10456
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10645
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10457
10646
|
model.layers[il].wo, nullptr,
|
10458
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10647
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10459
10648
|
}
|
10460
10649
|
|
10461
10650
|
if (il == n_layer - 1) {
|
@@ -10882,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
10882
11071
|
}
|
10883
11072
|
}
|
10884
11073
|
|
10885
|
-
|
11074
|
+
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
11075
|
+
// this allows multiple sequences to be processed in parallel with ALiBi-based models
|
11076
|
+
if (hparams.use_alibi) {
|
10886
11077
|
const int64_t n_kv = kv_self.n;
|
10887
11078
|
|
10888
11079
|
GGML_ASSERT(lctx.inp_KQ_pos);
|
@@ -11264,7 +11455,7 @@ static int llama_decode_internal(
|
|
11264
11455
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
11265
11456
|
// after enough generations, the benefit from this heuristic disappears
|
11266
11457
|
// if we start defragmenting the cache, the benefit from this will be more important
|
11267
|
-
kv_self.n = std::min(kv_self.size, std::max(
|
11458
|
+
kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
|
11268
11459
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
11269
11460
|
}
|
11270
11461
|
}
|
@@ -11432,6 +11623,10 @@ static int llama_decode_internal(
|
|
11432
11623
|
}
|
11433
11624
|
}
|
11434
11625
|
|
11626
|
+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
11627
|
+
// overlap with device computation.
|
11628
|
+
ggml_backend_sched_reset(lctx.sched);
|
11629
|
+
|
11435
11630
|
return 0;
|
11436
11631
|
}
|
11437
11632
|
|
@@ -11457,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|
11457
11652
|
// each move requires 6*n_layer tensors (see build_defrag)
|
11458
11653
|
// - source view, destination view, copy operation
|
11459
11654
|
// - x2 for keys and values
|
11460
|
-
const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
11655
|
+
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
11656
|
+
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
11657
|
+
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
|
11461
11658
|
|
11462
11659
|
// determine which KV cells to move where
|
11463
11660
|
//
|
@@ -11773,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
|
|
11773
11970
|
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
11774
11971
|
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
|
11775
11972
|
GGML_ASSERT(llama_is_byte_token(vocab, id));
|
11776
|
-
const auto& token_data = vocab.id_to_token.at(id);
|
11973
|
+
const auto & token_data = vocab.id_to_token.at(id);
|
11777
11974
|
switch (llama_vocab_get_type(vocab)) {
|
11778
11975
|
case LLAMA_VOCAB_TYPE_SPM: {
|
11779
11976
|
auto buf = token_data.text.substr(3, 2);
|
@@ -11781,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
11781
11978
|
}
|
11782
11979
|
case LLAMA_VOCAB_TYPE_BPE: {
|
11783
11980
|
GGML_ASSERT(false);
|
11784
|
-
return unicode_utf8_to_byte(token_data.text);
|
11981
|
+
return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
|
11785
11982
|
}
|
11786
11983
|
case LLAMA_VOCAB_TYPE_WPM: {
|
11787
11984
|
GGML_ASSERT(false);
|
@@ -12003,7 +12200,94 @@ struct llm_tokenizer_bpe {
|
|
12003
12200
|
|
12004
12201
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12005
12202
|
int final_prev_index = -1;
|
12006
|
-
|
12203
|
+
|
12204
|
+
std::vector<std::string> word_collection;
|
12205
|
+
switch (vocab.type) {
|
12206
|
+
case LLAMA_VOCAB_TYPE_BPE:
|
12207
|
+
switch (vocab.type_pre) {
|
12208
|
+
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
12209
|
+
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12210
|
+
word_collection = unicode_regex_split(text, {
|
12211
|
+
// original regex from tokenizer.json
|
12212
|
+
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12213
|
+
|
12214
|
+
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
12215
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12216
|
+
});
|
12217
|
+
break;
|
12218
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
12219
|
+
word_collection = unicode_regex_split(text, {
|
12220
|
+
"[\r\n]",
|
12221
|
+
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
12222
|
+
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
12223
|
+
"\\s+$",
|
12224
|
+
"[一-龥ࠀ-一가-]+",
|
12225
|
+
"\\p{N}+",
|
12226
|
+
});
|
12227
|
+
break;
|
12228
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
12229
|
+
word_collection = unicode_regex_split(text, {
|
12230
|
+
"[\r\n]",
|
12231
|
+
"\\s?\\p{L}+",
|
12232
|
+
"\\s?\\p{P}+",
|
12233
|
+
"[一-龥ࠀ-一가-]+",
|
12234
|
+
"\\p{N}",
|
12235
|
+
});
|
12236
|
+
break;
|
12237
|
+
case LLAMA_VOCAB_PRE_TYPE_FALCON:
|
12238
|
+
word_collection = unicode_regex_split(text, {
|
12239
|
+
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
12240
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12241
|
+
"[0-9][0-9][0-9]",
|
12242
|
+
});
|
12243
|
+
break;
|
12244
|
+
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
12245
|
+
// TODO: MPT pre-tokenization regexes are unknown
|
12246
|
+
// the regexes below are close, but not exact. to see the mismatches, run:
|
12247
|
+
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
|
12248
|
+
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
|
12249
|
+
word_collection = unicode_regex_split(text, {
|
12250
|
+
"\\s?\\p{L}+",
|
12251
|
+
"\\s?\\p{P}+",
|
12252
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12253
|
+
});
|
12254
|
+
break;
|
12255
|
+
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
12256
|
+
case LLAMA_VOCAB_PRE_TYPE_REFACT:
|
12257
|
+
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
12258
|
+
word_collection = unicode_regex_split(text, {
|
12259
|
+
"\\p{N}",
|
12260
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12261
|
+
});
|
12262
|
+
break;
|
12263
|
+
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
12264
|
+
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
12265
|
+
word_collection = unicode_regex_split(text, {
|
12266
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12267
|
+
});
|
12268
|
+
break;
|
12269
|
+
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
12270
|
+
word_collection = unicode_regex_split(text, {
|
12271
|
+
// original regex from tokenizer.json
|
12272
|
+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
12273
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12274
|
+
});
|
12275
|
+
break;
|
12276
|
+
default:
|
12277
|
+
// default regex for BPE tokenization pre-processing
|
12278
|
+
word_collection = unicode_regex_split(text, {
|
12279
|
+
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
12280
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12281
|
+
"\\p{N}+",
|
12282
|
+
"[0-9][0-9][0-9]",
|
12283
|
+
});
|
12284
|
+
break;
|
12285
|
+
}
|
12286
|
+
break;
|
12287
|
+
default:
|
12288
|
+
GGML_ASSERT(false);
|
12289
|
+
break;
|
12290
|
+
}
|
12007
12291
|
|
12008
12292
|
symbols_final.clear();
|
12009
12293
|
|
@@ -12130,145 +12414,6 @@ private:
|
|
12130
12414
|
work_queue.push(bigram);
|
12131
12415
|
}
|
12132
12416
|
|
12133
|
-
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
12134
|
-
std::vector<std::string> bpe_words;
|
12135
|
-
std::vector<std::string> bpe_encoded_words;
|
12136
|
-
|
12137
|
-
std::string token = "";
|
12138
|
-
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
12139
|
-
bool collecting_numeric = false;
|
12140
|
-
bool collecting_letter = false;
|
12141
|
-
bool collecting_special = false;
|
12142
|
-
bool collecting_whitespace_lookahead = false;
|
12143
|
-
bool collecting = false;
|
12144
|
-
|
12145
|
-
std::vector<std::string> text_utf;
|
12146
|
-
text_utf.reserve(text.size());
|
12147
|
-
bpe_words.reserve(text.size());
|
12148
|
-
bpe_encoded_words.reserve(text.size());
|
12149
|
-
|
12150
|
-
const auto cpts = unicode_cpts_from_utf8(text);
|
12151
|
-
for (size_t i = 0; i < cpts.size(); ++i)
|
12152
|
-
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
12153
|
-
|
12154
|
-
for (int i = 0; i < (int)text_utf.size(); i++) {
|
12155
|
-
const std::string & utf_char = text_utf[i];
|
12156
|
-
bool split_condition = false;
|
12157
|
-
int bytes_remain = text_utf.size() - i;
|
12158
|
-
// forward backward lookups
|
12159
|
-
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
12160
|
-
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
12161
|
-
|
12162
|
-
// handling contractions
|
12163
|
-
if (!split_condition && bytes_remain >= 2) {
|
12164
|
-
// 's|'t|'m|'d
|
12165
|
-
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
12166
|
-
split_condition = true;
|
12167
|
-
}
|
12168
|
-
if (split_condition) {
|
12169
|
-
if (token.size()) {
|
12170
|
-
bpe_words.emplace_back(token); // push previous content as token
|
12171
|
-
}
|
12172
|
-
token = utf_char + utf_char_next;
|
12173
|
-
bpe_words.emplace_back(token);
|
12174
|
-
token = "";
|
12175
|
-
i++;
|
12176
|
-
continue;
|
12177
|
-
}
|
12178
|
-
}
|
12179
|
-
if (!split_condition && bytes_remain >= 3) {
|
12180
|
-
// 're|'ve|'ll
|
12181
|
-
if (utf_char == "\'" && (
|
12182
|
-
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
12183
|
-
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
12184
|
-
(utf_char_next == "l" && utf_char_next_next == "l"))
|
12185
|
-
) {
|
12186
|
-
split_condition = true;
|
12187
|
-
}
|
12188
|
-
if (split_condition) {
|
12189
|
-
// current token + next token can be defined
|
12190
|
-
if (token.size()) {
|
12191
|
-
bpe_words.emplace_back(token); // push previous content as token
|
12192
|
-
}
|
12193
|
-
token = utf_char + utf_char_next + utf_char_next_next;
|
12194
|
-
bpe_words.emplace_back(token); // the contraction
|
12195
|
-
token = "";
|
12196
|
-
i += 2;
|
12197
|
-
continue;
|
12198
|
-
}
|
12199
|
-
}
|
12200
|
-
|
12201
|
-
if (!split_condition && !collecting) {
|
12202
|
-
if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
12203
|
-
collecting_letter = true;
|
12204
|
-
collecting = true;
|
12205
|
-
}
|
12206
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
12207
|
-
collecting_numeric = true;
|
12208
|
-
collecting = true;
|
12209
|
-
}
|
12210
|
-
else if (
|
12211
|
-
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
12212
|
-
(!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
12213
|
-
) {
|
12214
|
-
collecting_special = true;
|
12215
|
-
collecting = true;
|
12216
|
-
}
|
12217
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
12218
|
-
collecting_whitespace_lookahead = true;
|
12219
|
-
collecting = true;
|
12220
|
-
}
|
12221
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
12222
|
-
split_condition = true;
|
12223
|
-
}
|
12224
|
-
}
|
12225
|
-
else if (!split_condition && collecting) {
|
12226
|
-
if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
12227
|
-
split_condition = true;
|
12228
|
-
}
|
12229
|
-
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
|
12230
|
-
split_condition = true;
|
12231
|
-
}
|
12232
|
-
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
12233
|
-
split_condition = true;
|
12234
|
-
}
|
12235
|
-
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
12236
|
-
split_condition = true;
|
12237
|
-
}
|
12238
|
-
}
|
12239
|
-
|
12240
|
-
if (utf_char_next == "") {
|
12241
|
-
split_condition = true; // final
|
12242
|
-
token += utf_char;
|
12243
|
-
}
|
12244
|
-
|
12245
|
-
if (split_condition) {
|
12246
|
-
if (token.size()) {
|
12247
|
-
bpe_words.emplace_back(token);
|
12248
|
-
}
|
12249
|
-
token = utf_char;
|
12250
|
-
collecting = false;
|
12251
|
-
collecting_letter = false;
|
12252
|
-
collecting_numeric = false;
|
12253
|
-
collecting_special = false;
|
12254
|
-
collecting_whitespace_lookahead = false;
|
12255
|
-
}
|
12256
|
-
else {
|
12257
|
-
token += utf_char;
|
12258
|
-
}
|
12259
|
-
}
|
12260
|
-
|
12261
|
-
for (std::string & word : bpe_words) {
|
12262
|
-
std::string encoded_token = "";
|
12263
|
-
for (char & c : word) {
|
12264
|
-
encoded_token += unicode_byte_to_utf8(c);
|
12265
|
-
}
|
12266
|
-
bpe_encoded_words.emplace_back(encoded_token);
|
12267
|
-
}
|
12268
|
-
|
12269
|
-
return bpe_encoded_words;
|
12270
|
-
}
|
12271
|
-
|
12272
12417
|
const llama_vocab & vocab;
|
12273
12418
|
|
12274
12419
|
std::vector<llm_symbol> symbols;
|
@@ -12343,7 +12488,7 @@ struct llm_tokenizer_wpm {
|
|
12343
12488
|
continue;
|
12344
12489
|
}
|
12345
12490
|
code = unicode_tolower(code);
|
12346
|
-
if (type ==
|
12491
|
+
if (type == CODEPOINT_TYPE_SEPARATOR) {
|
12347
12492
|
code = ' ';
|
12348
12493
|
}
|
12349
12494
|
std::string s = unicode_cpt_to_utf8(code);
|
@@ -12588,7 +12733,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12588
12733
|
} break;
|
12589
12734
|
case LLAMA_VOCAB_TYPE_BPE:
|
12590
12735
|
{
|
12591
|
-
if (add_special && vocab.special_add_bos
|
12736
|
+
if (add_special && vocab.special_add_bos != 0) {
|
12592
12737
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
12593
12738
|
output.push_back(vocab.special_bos_id);
|
12594
12739
|
}
|
@@ -14030,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
|
|
14030
14175
|
if (qtype.to_float == NULL) {
|
14031
14176
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
14032
14177
|
}
|
14033
|
-
} else if (tensor->type != GGML_TYPE_F16
|
14178
|
+
} else if (tensor->type != GGML_TYPE_F16 &&
|
14179
|
+
tensor->type != GGML_TYPE_BF16) {
|
14034
14180
|
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
14035
14181
|
}
|
14036
14182
|
|
14037
14183
|
if (nthread < 2) {
|
14038
14184
|
if (tensor->type == GGML_TYPE_F16) {
|
14039
14185
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
14186
|
+
} else if (tensor->type == GGML_TYPE_BF16) {
|
14187
|
+
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
14040
14188
|
} else if (ggml_is_quantized(tensor->type)) {
|
14041
14189
|
qtype.to_float(tensor->data, f32_output, nelements);
|
14042
14190
|
} else {
|
@@ -14045,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
|
|
14045
14193
|
return;
|
14046
14194
|
}
|
14047
14195
|
|
14048
|
-
size_t block_size
|
14196
|
+
size_t block_size;
|
14197
|
+
if (tensor->type == GGML_TYPE_F16 ||
|
14198
|
+
tensor->type == GGML_TYPE_BF16) {
|
14199
|
+
block_size = 1;
|
14200
|
+
} else {
|
14201
|
+
block_size = (size_t)ggml_blck_size(tensor->type);
|
14202
|
+
}
|
14203
|
+
|
14049
14204
|
size_t block_size_bytes = ggml_type_size(tensor->type);
|
14050
14205
|
|
14051
14206
|
GGML_ASSERT(nelements % block_size == 0);
|
@@ -14064,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
|
|
14064
14219
|
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
14065
14220
|
if (typ == GGML_TYPE_F16) {
|
14066
14221
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
14222
|
+
} else if (typ == GGML_TYPE_BF16) {
|
14223
|
+
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
14067
14224
|
} else {
|
14068
14225
|
qtype.to_float(inbuf, outbuf, nels);
|
14069
14226
|
}
|
@@ -14360,14 +14517,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
14360
14517
|
}
|
14361
14518
|
|
14362
14519
|
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
14363
|
-
std::mutex mutex;
|
14364
|
-
int64_t counter = 0;
|
14365
|
-
size_t new_size = 0;
|
14366
14520
|
if (nthread < 2) {
|
14367
14521
|
// single-thread
|
14368
|
-
|
14522
|
+
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
14523
|
+
if (!ggml_validate_row_data(new_type, new_data, new_size)) {
|
14524
|
+
throw std::runtime_error("quantized data validation failed");
|
14525
|
+
}
|
14526
|
+
return new_size;
|
14369
14527
|
}
|
14370
|
-
|
14528
|
+
|
14529
|
+
std::mutex mutex;
|
14530
|
+
int64_t counter = 0;
|
14531
|
+
size_t new_size = 0;
|
14532
|
+
bool valid = true;
|
14533
|
+
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
|
14371
14534
|
nrows, n_per_row, imatrix]() {
|
14372
14535
|
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
14373
14536
|
size_t local_size = 0;
|
@@ -14382,7 +14545,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
14382
14545
|
}
|
14383
14546
|
lock.unlock();
|
14384
14547
|
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
14385
|
-
|
14548
|
+
size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
14549
|
+
local_size += this_size;
|
14550
|
+
|
14551
|
+
// validate the quantized data
|
14552
|
+
const size_t row_size = ggml_row_size(new_type, n_per_row);
|
14553
|
+
void * this_data = (char *) new_data + first_row * row_size;
|
14554
|
+
if (!ggml_validate_row_data(new_type, this_data, this_size)) {
|
14555
|
+
std::unique_lock<std::mutex> lock(mutex);
|
14556
|
+
valid = false;
|
14557
|
+
break;
|
14558
|
+
}
|
14386
14559
|
}
|
14387
14560
|
};
|
14388
14561
|
for (int it = 0; it < nthread - 1; ++it) {
|
@@ -14391,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
14391
14564
|
compute();
|
14392
14565
|
for (auto & w : workers) { w.join(); }
|
14393
14566
|
workers.clear();
|
14567
|
+
if (!valid) {
|
14568
|
+
throw std::runtime_error("quantized data validation failed");
|
14569
|
+
}
|
14394
14570
|
return new_size;
|
14395
14571
|
}
|
14396
14572
|
|
@@ -14405,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14405
14581
|
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
14406
14582
|
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
14407
14583
|
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
14584
|
+
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
|
14408
14585
|
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
14409
14586
|
|
14410
14587
|
// K-quants
|
@@ -14453,7 +14630,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14453
14630
|
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
14454
14631
|
kv_overrides = v->data();
|
14455
14632
|
}
|
14456
|
-
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
14633
|
+
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
|
14457
14634
|
ml.init_mappings(false); // no prefetching
|
14458
14635
|
|
14459
14636
|
llama_model model;
|
@@ -14491,11 +14668,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14491
14668
|
for (auto & o : overrides) {
|
14492
14669
|
if (o.key[0] == 0) break;
|
14493
14670
|
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
14494
|
-
gguf_set_val_f32(ctx_out, o.key, o.
|
14671
|
+
gguf_set_val_f32(ctx_out, o.key, o.val_f64);
|
14495
14672
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
14496
|
-
gguf_set_val_i32(ctx_out, o.key, o.
|
14673
|
+
gguf_set_val_i32(ctx_out, o.key, o.val_i64);
|
14497
14674
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
14498
|
-
gguf_set_val_bool(ctx_out, o.key, o.
|
14675
|
+
gguf_set_val_bool(ctx_out, o.key, o.val_bool);
|
14676
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
14677
|
+
gguf_set_val_str(ctx_out, o.key, o.val_str);
|
14499
14678
|
} else {
|
14500
14679
|
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
14501
14680
|
}
|
@@ -14814,7 +14993,7 @@ static int llama_apply_lora_from_file_internal(
|
|
14814
14993
|
std::unique_ptr<llama_model_loader> ml;
|
14815
14994
|
if (path_base_model) {
|
14816
14995
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
14817
|
-
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
14996
|
+
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
14818
14997
|
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
14819
14998
|
}
|
14820
14999
|
|
@@ -15073,6 +15252,7 @@ struct llama_model_params llama_model_default_params() {
|
|
15073
15252
|
/*.vocab_only =*/ false,
|
15074
15253
|
/*.use_mmap =*/ true,
|
15075
15254
|
/*.use_mlock =*/ false,
|
15255
|
+
/*.check_tensors =*/ false,
|
15076
15256
|
};
|
15077
15257
|
|
15078
15258
|
#ifdef GGML_USE_METAL
|
@@ -15109,6 +15289,7 @@ struct llama_context_params llama_context_default_params() {
|
|
15109
15289
|
/*.logits_all =*/ false,
|
15110
15290
|
/*.embeddings =*/ false,
|
15111
15291
|
/*.offload_kqv =*/ true,
|
15292
|
+
/*.flash_attn =*/ false,
|
15112
15293
|
/*.abort_callback =*/ nullptr,
|
15113
15294
|
/*.abort_callback_data =*/ nullptr,
|
15114
15295
|
};
|
@@ -15275,6 +15456,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15275
15456
|
cparams.defrag_thold = params.defrag_thold;
|
15276
15457
|
cparams.embeddings = params.embeddings;
|
15277
15458
|
cparams.offload_kqv = params.offload_kqv;
|
15459
|
+
cparams.flash_attn = params.flash_attn;
|
15278
15460
|
cparams.pooling_type = params.pooling_type;
|
15279
15461
|
|
15280
15462
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
@@ -15282,12 +15464,20 @@ struct llama_context * llama_new_context_with_model(
|
|
15282
15464
|
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
15283
15465
|
|
15284
15466
|
// this is necessary due to kv_self.n being padded later during inference
|
15285
|
-
cparams.n_ctx
|
15467
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
|
15286
15468
|
|
15287
15469
|
// with causal attention, the batch size is limited by the context size
|
15288
15470
|
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
15289
|
-
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
15290
15471
|
|
15472
|
+
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
|
15473
|
+
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
|
15474
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
|
15475
|
+
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
|
15476
|
+
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
|
15477
|
+
cparams.n_batch = GGML_KQ_MASK_PAD;
|
15478
|
+
}
|
15479
|
+
|
15480
|
+
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
15291
15481
|
|
15292
15482
|
cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
15293
15483
|
hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
|
@@ -15319,6 +15509,16 @@ struct llama_context * llama_new_context_with_model(
|
|
15319
15509
|
}
|
15320
15510
|
}
|
15321
15511
|
|
15512
|
+
if (cparams.flash_attn && hparams.use_alibi) {
|
15513
|
+
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
|
15514
|
+
cparams.flash_attn = false;
|
15515
|
+
}
|
15516
|
+
|
15517
|
+
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15518
|
+
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15519
|
+
cparams.flash_attn = false;
|
15520
|
+
}
|
15521
|
+
|
15322
15522
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
15323
15523
|
params.seed = time(NULL);
|
15324
15524
|
}
|
@@ -15326,6 +15526,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15326
15526
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
15327
15527
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
15328
15528
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
15529
|
+
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
15329
15530
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
15330
15531
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
15331
15532
|
|
@@ -15454,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15454
15655
|
}
|
15455
15656
|
ctx->backends.push_back(ctx->backend_cpu);
|
15456
15657
|
|
15457
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx
|
15658
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
15458
15659
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
15459
15660
|
llama_free(ctx);
|
15460
15661
|
return nullptr;
|
@@ -16053,6 +16254,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
16053
16254
|
const size_t s_kv_head = sizeof(uint32_t);
|
16054
16255
|
const size_t s_kv_size = sizeof(uint32_t);
|
16055
16256
|
const size_t s_kv_used = sizeof(uint32_t);
|
16257
|
+
const size_t s_v_trans = sizeof(uint32_t);
|
16056
16258
|
const size_t s_kv = ctx->kv_self.total_size();
|
16057
16259
|
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
16058
16260
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
@@ -16070,10 +16272,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
16070
16272
|
+ s_kv_head
|
16071
16273
|
+ s_kv_size
|
16072
16274
|
+ s_kv_used
|
16275
|
+
+ s_v_trans
|
16073
16276
|
+ s_kv
|
16074
16277
|
+ s_kv_cells
|
16075
16278
|
);
|
16076
16279
|
|
16280
|
+
// on session change it is very likely that the state size has changed - so we need to update this function
|
16281
|
+
static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
|
16282
|
+
|
16077
16283
|
return s_total;
|
16078
16284
|
}
|
16079
16285
|
|
@@ -16219,11 +16425,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
16219
16425
|
const uint32_t kv_size = kv_self.size;
|
16220
16426
|
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
16221
16427
|
const uint32_t kv_used = kv_self.used;
|
16428
|
+
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
|
16222
16429
|
|
16223
16430
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
16224
16431
|
data_ctx->write(&kv_head, sizeof(kv_head));
|
16225
16432
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
16226
16433
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
16434
|
+
data_ctx->write(&v_trans, sizeof(v_trans));
|
16227
16435
|
|
16228
16436
|
if (kv_buf_size) {
|
16229
16437
|
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
@@ -16236,7 +16444,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
16236
16444
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
16237
16445
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
16238
16446
|
|
16239
|
-
if (kv_self.recurrent) {
|
16447
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
16240
16448
|
// v is contiguous for recurrent models
|
16241
16449
|
// TODO: use other tensors for state models than k and v
|
16242
16450
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16369,11 +16577,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16369
16577
|
uint32_t kv_head;
|
16370
16578
|
uint32_t kv_size;
|
16371
16579
|
uint32_t kv_used;
|
16580
|
+
uint32_t v_trans;
|
16372
16581
|
|
16373
16582
|
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
16374
16583
|
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
16375
16584
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
16376
16585
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
16586
|
+
memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
|
16587
|
+
|
16588
|
+
GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
|
16377
16589
|
|
16378
16590
|
if (kv_self.size != kv_size) {
|
16379
16591
|
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
@@ -16383,6 +16595,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16383
16595
|
__func__, kv_head, kv_size, kv_self.size);
|
16384
16596
|
}
|
16385
16597
|
|
16598
|
+
llama_kv_cache_clear(ctx);
|
16599
|
+
|
16386
16600
|
if (kv_buf_size) {
|
16387
16601
|
const size_t pre_kv_buf_size = inp - src;
|
16388
16602
|
|
@@ -16394,7 +16608,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16394
16608
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
16395
16609
|
inp += k_size;
|
16396
16610
|
|
16397
|
-
if (kv_self.recurrent) {
|
16611
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
16398
16612
|
// v is contiguous for recurrent models
|
16399
16613
|
// TODO: use other tensors for state models than k and v
|
16400
16614
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16416,8 +16630,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16416
16630
|
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
16417
16631
|
}
|
16418
16632
|
|
16419
|
-
llama_kv_cache_clear(ctx);
|
16420
|
-
|
16421
16633
|
ctx->kv_self.head = kv_head;
|
16422
16634
|
ctx->kv_self.used = kv_used;
|
16423
16635
|
|
@@ -16677,28 +16889,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
|
16677
16889
|
}
|
16678
16890
|
}
|
16679
16891
|
|
16680
|
-
//
|
16681
|
-
|
16682
|
-
|
16683
|
-
|
16684
|
-
|
16685
|
-
|
16892
|
+
// TODO: simplify, reduce copy-paste
|
16893
|
+
if (!kv_self.v_trans) {
|
16894
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16895
|
+
// Write value type
|
16896
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16897
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16686
16898
|
|
16687
|
-
|
16688
|
-
|
16689
|
-
|
16899
|
+
// Write row size of value
|
16900
|
+
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
16901
|
+
data_ctx.write(&v_size_row, sizeof(v_size_row));
|
16690
16902
|
|
16691
|
-
|
16692
|
-
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16693
|
-
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16903
|
+
// Read each range of cells of v_size length each into tmp_buf and write out
|
16694
16904
|
for (const auto & range : cell_ranges) {
|
16695
16905
|
const size_t range_size = range.second - range.first;
|
16696
|
-
|
16697
|
-
tmp_buf.
|
16698
|
-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16906
|
+
tmp_buf.resize(range_size * v_size_row);
|
16907
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
16699
16908
|
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16700
16909
|
}
|
16701
16910
|
}
|
16911
|
+
} else {
|
16912
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16913
|
+
const uint32_t kv_size = kv_self.size;
|
16914
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16915
|
+
// Write value type
|
16916
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16917
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16918
|
+
|
16919
|
+
// Write element size
|
16920
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16921
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16922
|
+
|
16923
|
+
// For each row, we get the element values of each cell
|
16924
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16925
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16926
|
+
for (const auto & range : cell_ranges) {
|
16927
|
+
const size_t range_size = range.second - range.first;
|
16928
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16929
|
+
tmp_buf.resize(range_size * v_size_el);
|
16930
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16931
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16932
|
+
}
|
16933
|
+
}
|
16934
|
+
}
|
16702
16935
|
}
|
16703
16936
|
|
16704
16937
|
return data_ctx.get_size_written();
|
@@ -16823,41 +17056,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
|
|
16823
17056
|
}
|
16824
17057
|
}
|
16825
17058
|
|
16826
|
-
//
|
16827
|
-
|
16828
|
-
|
16829
|
-
|
16830
|
-
|
16831
|
-
|
16832
|
-
|
16833
|
-
|
16834
|
-
|
16835
|
-
|
16836
|
-
|
16837
|
-
|
17059
|
+
// TODO: simplify, reduce copy-paste
|
17060
|
+
if (!kv_self.v_trans) {
|
17061
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
17062
|
+
// Read type of value
|
17063
|
+
int32_t v_type_i_ref;
|
17064
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
17065
|
+
inp += sizeof(v_type_i_ref);
|
17066
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
17067
|
+
if (v_type_i != v_type_i_ref) {
|
17068
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17069
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
17070
|
+
return 0;
|
17071
|
+
}
|
16838
17072
|
|
16839
|
-
|
16840
|
-
|
16841
|
-
|
16842
|
-
|
16843
|
-
|
16844
|
-
|
16845
|
-
|
16846
|
-
|
16847
|
-
|
16848
|
-
|
17073
|
+
// Read row size of value
|
17074
|
+
size_t v_size_row_ref;
|
17075
|
+
memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
|
17076
|
+
inp += sizeof(v_size_row_ref);
|
17077
|
+
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
17078
|
+
if (v_size_row != v_size_row_ref) {
|
17079
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17080
|
+
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
|
17081
|
+
return 0;
|
17082
|
+
}
|
16849
17083
|
|
16850
|
-
|
16851
|
-
|
16852
|
-
|
16853
|
-
|
16854
|
-
|
16855
|
-
|
17084
|
+
if (cell_count) {
|
17085
|
+
// Read and set the values for the whole cell range
|
17086
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
|
17087
|
+
inp += cell_count * v_size_row;
|
17088
|
+
}
|
17089
|
+
}
|
17090
|
+
} else {
|
17091
|
+
// For each layer, read the values for each cell (transposed)
|
17092
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
17093
|
+
// Read type of value
|
17094
|
+
int32_t v_type_i_ref;
|
17095
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
17096
|
+
inp += sizeof(v_type_i_ref);
|
17097
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
17098
|
+
if (v_type_i != v_type_i_ref) {
|
17099
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17100
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
17101
|
+
return 0;
|
17102
|
+
}
|
17103
|
+
|
17104
|
+
// Read element size of value
|
17105
|
+
size_t v_size_el_ref;
|
17106
|
+
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
17107
|
+
inp += sizeof(v_size_el_ref);
|
17108
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
17109
|
+
if (v_size_el != v_size_el_ref) {
|
17110
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17111
|
+
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
17112
|
+
return 0;
|
17113
|
+
}
|
17114
|
+
|
17115
|
+
if (cell_count) {
|
17116
|
+
// For each row in the transposed matrix, read the values for the whole cell range
|
17117
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
17118
|
+
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
17119
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
17120
|
+
inp += cell_count * v_size_el;
|
17121
|
+
}
|
16856
17122
|
}
|
16857
17123
|
}
|
16858
17124
|
}
|
16859
17125
|
|
16860
17126
|
const size_t nread = inp - src;
|
17127
|
+
|
16861
17128
|
return nread;
|
16862
17129
|
}
|
16863
17130
|
|
@@ -17238,9 +17505,10 @@ int32_t llama_tokenize(
|
|
17238
17505
|
|
17239
17506
|
static std::string llama_decode_text(const std::string & text) {
|
17240
17507
|
std::string decoded_text;
|
17241
|
-
|
17242
|
-
|
17243
|
-
|
17508
|
+
|
17509
|
+
const auto cpts = unicode_cpts_from_utf8(text);
|
17510
|
+
for (const auto cpt : cpts) {
|
17511
|
+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
|
17244
17512
|
}
|
17245
17513
|
|
17246
17514
|
return decoded_text;
|
@@ -17604,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
|
17604
17872
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
17605
17873
|
|
17606
17874
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
17607
|
-
/*.n_p_eval =*/ std::max(
|
17875
|
+
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
|
17608
17876
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
17609
17877
|
};
|
17610
17878
|
|
@@ -17654,9 +17922,9 @@ const char * llama_print_system_info(void) {
|
|
17654
17922
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
17655
17923
|
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
17656
17924
|
#ifdef GGML_USE_LLAMAFILE
|
17657
|
-
s += "
|
17925
|
+
s += "LLAMAFILE = 1 | ";
|
17658
17926
|
#else
|
17659
|
-
s += "
|
17927
|
+
s += "LLAMAFILE = 0 | ";
|
17660
17928
|
#endif
|
17661
17929
|
|
17662
17930
|
return s.c_str();
|