llama_cpp 0.14.7 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +59 -9
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -3
- data/vendor/tmp/llama.cpp/Makefile +42 -18
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -5
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +78 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +399 -184
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +302 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +28 -16
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +951 -263
- data/vendor/tmp/llama.cpp/ggml.c +1457 -92
- data/vendor/tmp/llama.cpp/ggml.h +37 -7
- data/vendor/tmp/llama.cpp/llama.cpp +671 -403
- data/vendor/tmp/llama.cpp/llama.h +34 -10
- data/vendor/tmp/llama.cpp/sgemm.cpp +134 -103
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1188 -656
- data/vendor/tmp/llama.cpp/unicode-data.h +4 -3
- data/vendor/tmp/llama.cpp/unicode.cpp +590 -49
- data/vendor/tmp/llama.cpp/unicode.h +6 -3
- metadata +3 -3
@@ -75,6 +75,7 @@
 #include <forward_list>
 #include <fstream>
 #include <functional>
+#include <future>
 #include <initializer_list>
 #include <locale>
 #include <map>
@@ -107,7 +108,6 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 60

-
 //
 // logging
 //
@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,

     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -1843,7 +1845,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;

     bool causal_attn = true;
-    bool
+    bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -1933,6 +1935,7 @@ struct llama_cparams {
     bool embeddings;
     bool causal_attn;
     bool offload_kqv;
+    bool flash_attn;

     enum llama_pooling_type pooling_type;

@@ -2036,8 +2039,8 @@ struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
     bool do_copy = false;
-    // with recurrent state models, a cell can hold the state for more than one past token
-    bool
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans = true;    // the value tensor is transposed

     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2114,7 +2117,8 @@ struct llama_vocab {
         ttype type;
     };

-    enum llama_vocab_type
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -2335,11 +2339,14 @@ struct llama_context {

 static bool llama_kv_cache_init(
         struct llama_kv_cache & cache,
-
+          const llama_context * ctx,
                     ggml_type   type_k,
                     ggml_type   type_v,
                      uint32_t   kv_size,
                          bool   offload) {
+    const llama_model & model = ctx->model;
+    const llama_cparams & cparams = ctx->cparams;
+
     const struct llama_hparams & hparams = model.hparams;

     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2350,8 +2357,9 @@ static bool llama_kv_cache_init(

     // TODO: find a nicer way to add other recurrent model architectures
     cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+    cache.v_trans   = !cparams.flash_attn;

-    // TODO: support mixed
+    // TODO: support mixed recurrent Transformer architectures
     // NOTE: (!a || b) is a logical implication (a -> b)
     GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
     GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2562,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     }
     cache.head = 0;
     cache.used = 0;
+
+    for (auto & buf : cache.bufs) {
+        ggml_backend_buffer_clear(buf, 0);
+    }
 }

 static bool llama_kv_cache_seq_rm(
@@ -2882,6 +2894,7 @@ namespace GGUFMeta {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
             case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
         }
         return "unknown";
     }
@@ -2893,13 +2906,16 @@ namespace GGUFMeta {
                 __func__, override_type_to_str(ovrd->tag), ovrd->key);
         switch (ovrd->tag) {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
-                LLAMA_LOG_INFO("%s\n", ovrd->
+                LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
             } break;
             case LLAMA_KV_OVERRIDE_TYPE_INT: {
-                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->
+                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
             } break;
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                LLAMA_LOG_INFO("%.6f\n", ovrd->
+                LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+            } break;
+            case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                LLAMA_LOG_INFO("%s\n", ovrd->val_str);
             } break;
             default:
                 // Shouldn't be possible to end up here, but just in case...
@@ -2918,7 +2934,7 @@ namespace GGUFMeta {
     static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_bool;
             return true;
         }
         return false;
@@ -2928,7 +2944,7 @@ namespace GGUFMeta {
     static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_i64;
             return true;
         }
         return false;
@@ -2938,7 +2954,7 @@ namespace GGUFMeta {
     static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_f64;
             return true;
         }
         return false;
@@ -2947,12 +2963,11 @@ namespace GGUFMeta {
     template<typename OT>
     static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override * ovrd) {
-        (
-
-
-
-
-                ovrd ? ovrd->key : "NULL"));
+        if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+            target = ovrd->val_str;
+            return true;
+        }
+        return false;
     }

     static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
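The hunks above extend the GGUFMeta override machinery with a string variant (`LLAMA_KV_OVERRIDE_TYPE_STR` / `val_str`). For orientation, here is a minimal sketch of how a caller might supply such an override through the public API. It assumes the `llama_model_kv_override` layout implied by these hunks (a `tag`, a fixed-size `key`, and a `val_str` union member); the field sizes and the empty-key array terminator are assumptions, not quoted from llama.h.

```cpp
// Hedged sketch: override "tokenizer.ggml.pre" with a string value at load time.
#include <cstring>
#include "llama.h"

int main() {
    llama_model_kv_override overrides[2] = {};

    overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::strncpy(overrides[0].key, "tokenizer.ggml.pre", sizeof(overrides[0].key) - 1);
    std::strncpy(overrides[0].val_str, "llama3", sizeof(overrides[0].val_str) - 1);

    // assumed convention: the array is terminated by an entry with an empty key
    overrides[1].key[0] = '\0';

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) {
        llama_free_model(model);
    }
    return 0;
}
```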
@@ -2985,6 +3000,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;

     bool use_mmap = false;
+    bool check_tensors;

     llama_files files;
     llama_ftype ftype;
@@ -3018,7 +3034,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
             trace = atoi(getenv("LLAMA_TRACE"));
@@ -3115,9 +3131,17 @@ struct llama_model_loader {

         fver = (enum llama_fver) gguf_get_version(meta);

+        std::set<std::string> tensor_names;
         for (auto & w : weights) {
             n_elements += ggml_nelements(w.tensor);
             n_bytes    += ggml_nbytes(w.tensor);
+            // make sure there is no duplicated tensor names
+            const std::string name(w.tensor->name);
+            auto found = tensor_names.find(name);
+            if (found != tensor_names.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+            }
+            tensor_names.insert(name);
         }

         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3151,6 +3175,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32:  ftype = LLAMA_FTYPE_ALL_F32; break;
             case GGML_TYPE_F16:  ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+            case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
             case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
             case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3223,6 +3248,7 @@ struct llama_model_loader {
         }

         this->use_mmap = use_mmap;
+        this->check_tensors = check_tensors;
     }

     ~llama_model_loader() {
@@ -3481,6 +3507,10 @@ struct llama_model_loader {
                 file->seek(w.offs, SEEK_SET);
                 file->read_raw(cur->data, ggml_nbytes(cur));
             }
+
+            if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+            }
         }

         size_t size_done = 0;
@@ -3497,6 +3527,8 @@ struct llama_model_loader {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");

         std::vector<no_init<uint8_t>> read_buf;
+        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3518,37 +3550,66 @@ struct llama_model_loader {
                 if (bufs_mmap.count(weight->idx)) {
                     buf_mmap = bufs_mmap.at(weight->idx);
                 }
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur,
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
                     if (lmlocks) {
                         const auto & lmlock = lmlocks->at(weight->idx);
-                        lmlock->grow_to(weight->offs +
+                        lmlock->grow_to(weight->offs + n_size);
                     }

                     auto & mmap_used = mmaps_used[weight->idx];
                     mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                     mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur,
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data,
+                    file->read_raw(cur->data, n_size);
+                    if (check_tensors) {
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                        }));
+                    }
                 } else {
-                    read_buf.resize(
+                    read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(),
+                    file->read_raw(read_buf.data(), n_size);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }

             size_done += n_size;
         }

+        // check validation results
+        bool validation_failed = false;
+        for (auto & future : validation_result) {
+            auto result = future.get();
+            if (!result.second) {
+                LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+                validation_failed = true;
+            }
+        }
+        if (validation_failed) {
+            throw std::runtime_error("found tensors with invalid data");
+        }
+
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
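The loader above overlaps tensor reads with data validation: each mmapped or host-resident tensor spawns a `std::async` task, and all futures are joined once at the end so every bad tensor is reported before a single failure is thrown. A self-contained sketch of that pattern, with a placeholder `validate_row_data` standing in for `ggml_validate_row_data`:

```cpp
// Sketch of the overlapped-validation pattern: fan out async checks, join last.
#include <cstdint>
#include <future>
#include <stdexcept>
#include <utility>
#include <vector>

static bool validate_row_data(const std::vector<uint8_t> & buf) {
    for (uint8_t b : buf) {
        if (b == 0xFF) return false; // stand-in for a NaN/Inf-style check
    }
    return true;
}

void load_and_validate(const std::vector<std::vector<uint8_t>> & tensors) {
    std::vector<std::future<std::pair<size_t, bool>>> results;

    for (size_t i = 0; i < tensors.size(); ++i) {
        // validation runs in the background while the "load" loop keeps going
        results.emplace_back(std::async(std::launch::async, [i, &tensors] {
            return std::make_pair(i, validate_row_data(tensors[i]));
        }));
    }

    bool failed = false;
    for (auto & f : results) {
        auto r = f.get();           // joins the task
        if (!r.second) failed = true; // record every bad tensor before throwing
    }
    if (failed) {
        throw std::runtime_error("found tensors with invalid data");
    }
}
```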
@@ -3606,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -4142,7 +4204,7 @@ static void llm_load_hparams(
     model.ftype = ml.ftype;

     if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.
+        hparams.use_alibi = true;
     }

     hparams.rope_type = llama_rope_type(&model);
@@ -4165,11 +4227,13 @@ static void llm_load_vocab(

     // determine vocab type
     {
-        std::string
+        std::string tokenizer_model;
+        std::string tokenizer_pre;

-        ml.get_key(LLM_KV_TOKENIZER_MODEL,
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

-        if (
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;

             // default special tokens
@@ -4183,7 +4247,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;

             return;
-        } else if (
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;

             // default special tokens
@@ -4228,9 +4292,27 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (
-            vocab.type =
+        } else if (tokenizer_model == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;

+            // default special tokens
+            vocab.special_bos_id  = -1;
+            vocab.special_eos_id  = -1;
+            vocab.special_unk_id  = 100;
+            vocab.special_sep_id  = 102;
+            vocab.special_pad_id  = 0;
+            vocab.special_cls_id  = 101;
+            vocab.special_mask_id = 103;
+            vocab.add_space_prefix = false;
+        } else {
+            if (tokenizer_model == "gpt2") {
+                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+            } else {
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+                vocab.type = LLAMA_VOCAB_TYPE_SPM;
+                return;
+            }
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4264,23 +4346,65 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
-        }
-        vocab.type = LLAMA_VOCAB_TYPE_WPM;
+        }

-
-
-
-
-
-
-
-
-
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3" ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "refact") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                    tokenizer_pre == "command-r") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+            } else if (
+                    tokenizer_pre == "qwen2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                    tokenizer_pre == "olmo") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                    tokenizer_pre == "dbrx") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
         } else {
-
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
     }

@@ -5975,7 +6099,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

         model.hparams.vocab_only = params.vocab_only;

@@ -6013,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
             || !(
                 model.ftype == LLAMA_FTYPE_ALL_F32 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
             )
@@ -6104,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * k_cur,
          struct ggml_tensor * v_cur,
-                    int64_t   n_ctx,
                     int32_t   n_tokens,
                     int32_t   kv_head,
          const llm_build_cb & cb,
                     int64_t   il) {
+    const int64_t n_ctx = cparams.n_ctx;
+
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

     GGML_ASSERT(kv.size == n_ctx);

-    // compute the transposed [n_tokens, n_embd] V matrix
-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
-    cb(v_cur_t, "v_cur_t", il);
-
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
             (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);

-
-
-
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    if (cparams.flash_attn) {
+        v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+                (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+                (  n_ctx)*ggml_element_size(kv.v_l[il]),
+                (kv_head)*ggml_element_size(kv.v_l[il]));
+
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
     cb(v_cache_view, "v_cache_view", il);

-
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
 }

 static struct ggml_tensor * llm_build_norm(
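The branch above keeps the V cache un-transposed when flash attention is enabled, and transposed otherwise so that each channel's token history is contiguous for the subsequent KQV matmul. A small illustration of the two addressing schemes (the sizes here are made up, not taken from the diff):

```cpp
// Byte offset of value element (token, channel) under the two V-cache layouts.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t elt    = 2;    // bytes per element (e.g. F16)
    const size_t n_ctx  = 4096; // KV cache size in tokens
    const size_t n_embd = 128;  // value channels per layer (n_embd_v_gqa)

    const size_t token = 7, channel = 3;

    // flash-attention layout: [n_embd x n_ctx], token-major (not transposed)
    size_t off_flat  = (token * n_embd + channel) * elt;

    // classic layout: [n_ctx x n_embd], channel-major (transposed)
    size_t off_trans = (channel * n_ctx + token) * elt;

    std::printf("flat: %zu bytes, transposed: %zu bytes\n", off_flat, off_trans);
    return 0;
}
```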
@@ -6354,11 +6489,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
     return moe_out;
 }

-// if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
           const llama_model & model,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
@@ -6366,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv(
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
          struct ggml_tensor * kq_pos,
-                    int64_t   n_ctx,
                     int32_t   n_tokens,
                     int32_t   n_kv,
                       float   kq_scale,
          const llm_build_cb & cb,
                         int   il) {
+    const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head        = hparams.n_head;
     const int64_t n_head_kv     = hparams.n_head_kv;
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6389,71 +6524,99 @@ static struct ggml_tensor * llm_build_kqv(
                 0);
     cb(k, "k", il);

-    struct ggml_tensor *
-
+    struct ggml_tensor * cur;
+
+    if (cparams.flash_attn) {
+        GGML_UNUSED(model);
+        GGML_UNUSED(n_ctx);

-
-        //
-
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-    }
+        // note: if this assert triggers, then some check has failed earlier
+        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
+        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");

-
-
-
-
-
-
+        // split cached v into n_head heads (not transposed)
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_embd_head_v, n_kv, n_head_kv,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    0);
+        cb(v, "v", il);

-
-    //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);

-
-
-
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+        }
+
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+    } else {
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+        cb(kq, "kq", il);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+        }
+
+        if (model.arch == LLM_ARCH_GROK) {
+            // need to do the following:
+            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // and then :
+            // kq = 30 * tanh(kq / 30)
+            // before the softmax below
+
+            //try from phi2
+            //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+            kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+            kq = ggml_scale(ctx, kq, 30);
+        }

 #if defined(GGML_USE_KOMPUTE)
 #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
-
-
-
+        if (hparams.use_alibi) {
+            kq = ggml_scale(ctx, kq, kq_scale);
+            cb(kq, "kq_scaled", il);

-
-
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);

-
-
+            kq = ggml_add(ctx, kq, kq_mask);
+            cb(kq, "kq_masked", il);

-
-
-
+            kq = ggml_soft_max(ctx, kq);
+            cb(kq, "kq_soft_max", il);
+        } else
 #endif
-
-
-
-
+        {
+            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+        }

-
+        GGML_ASSERT(kv.size == n_ctx);

-
-
-
-
-
-
-
-
+        // split cached v into n_head heads
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_kv, n_embd_head_v, n_head_kv,
+                    ggml_element_size(kv.v_l[il])*n_ctx,
+                    ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+                    0);
+        cb(v, "v", il);

-
-
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+        cb(kqv, "kqv", il);

-
-
+        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+        cb(kqv_merged, "kqv_merged", il);

-
-
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cb(cur, "kqv_merged_cont", il);
+    }

     ggml_build_forward_expand(graph, cur);

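In the Grok branch above, the 0.08838834764831845 attention-output multiplier is folded into the 30·tanh(x/30) soft-cap by scaling once with `mult/30` before the tanh. A quick numeric check that the fused form equals applying the multiplier first:

```cpp
// The fused scale (mult/30) and the stepwise form agree up to float rounding.
#include <cmath>
#include <cstdio>

int main() {
    const float mult = 0.08838834764831845f; // attn_output_multiplier
    const float kq   = 123.0f;               // an example raw logit

    float fused    = 30.0f * std::tanh(kq * (mult / 30.0f)); // as built in the graph
    float stepwise = 30.0f * std::tanh((kq * mult) / 30.0f); // multiplier applied first

    std::printf("fused=%f stepwise=%f\n", fused, stepwise);
    return 0;
}
```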
@@ -6473,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_context * ctx,
           const llama_model & model,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
@@ -6482,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv(
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
          struct ggml_tensor * kq_pos,
-                    int64_t   n_ctx,
                     int32_t   n_tokens,
                     int32_t   kv_head,
                     int32_t   n_kv,
@@ -6496,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv(
     ggml_build_forward_expand(graph, k_cur);
     ggml_build_forward_expand(graph, v_cur);

-    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
+    llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);

     struct ggml_tensor * cur;

-    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos,
+    cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);

     return cur;
@@ -6543,6 +6706,8 @@ struct llm_build_context {
     const int32_t kv_head;  // index of where we store new KV data in the cache
     const int32_t n_orig_ctx;

+    const bool flash_attn;
+
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type    rope_type;

@@ -6589,6 +6754,7 @@ struct llm_build_context {
         n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
         kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
+        flash_attn       (cparams.flash_attn),
         pooling_type     (cparams.pooling_type),
         rope_type        (hparams.rope_type),
         cb               (cb),
@@ -6703,15 +6869,31 @@ struct llm_build_context {
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));

-                ggml_tensor * view_v_src
-
-
-
+                ggml_tensor * view_v_src;
+                ggml_tensor * view_v_dst;
+
+                if (flash_attn) {
+                    // NOTE: the V cache is not transposed when using flash attention
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));

-
-
-
-
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            n_embd_v_gqa, nm,
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                } else {
+                    view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, i));
+
+                    view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                            nm, n_embd_v_gqa,
+                            ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                            ggml_row_size(kv_self.v_l[il]->type, id));
+                }

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6741,20 +6923,26 @@ struct llm_build_context {

     struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
         if (causal) {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         } else {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         }
         cb(lctx.inp_KQ_mask, "KQ_mask", -1);
         ggml_set_input(lctx.inp_KQ_mask);
-        return lctx.inp_KQ_mask;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }

-    struct ggml_tensor * build_inp_KQ_pos() {
-
+    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
+        if (causal) {
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+        } else {
+            // TODO: this will be needed for ALiBi-based BERT models
+            //       https://github.com/ggerganov/llama.cpp/pull/6826
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
+        }
         cb(lctx.inp_KQ_pos, "KQ_pos", -1);
         ggml_set_input(lctx.inp_KQ_pos);
-        return lctx.inp_KQ_pos;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
     }

     struct ggml_tensor * build_inp_mean() {
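The mask rows are now padded to a multiple of `GGML_KQ_MASK_PAD`, presumably so the flash-attention kernel sees an aligned row stride. The `GGML_PAD` round-up is replicated below for illustration; the macro body and the padding value 32 are assumptions about ggml's definitions, not quoted from the headers in this diff.

```cpp
// GGML_PAD rounds x up to the next multiple of n (assumed definition).
#include <cstdio>

#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main() {
    const int GGML_KQ_MASK_PAD = 32; // assumed padding constant
    std::printf("%d -> %d\n",  1, GGML_PAD( 1, GGML_KQ_MASK_PAD)); //  1 -> 32
    std::printf("%d -> %d\n", 33, GGML_PAD(33, GGML_KQ_MASK_PAD)); // 33 -> 64
    std::printf("%d -> %d\n", 64, GGML_PAD(64, GGML_KQ_MASK_PAD)); // 64 -> 64
    return 0;
}
```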
@@ -6860,9 +7048,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7000,9 +7188,9 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7107,9 +7295,9 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7227,9 +7415,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7352,9 +7540,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -7504,9 +7692,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7616,9 +7804,9 @@ struct llm_build_context {

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7820,9 +8008,9 @@ struct llm_build_context {
                 );
                 cb(Vcur, "Vcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, nullptr,
+                        Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -7916,9 +8104,9 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 cb(Qcur, "Qcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -8209,9 +8397,9 @@ struct llm_build_context {

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -8340,14 +8528,15 @@ struct llm_build_context {
                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

-                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+                    cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                            model.layers[il].wo, model.layers[il].bo,
+                            Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 } else {
                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
+
+                    cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                             model.layers[il].wo, model.layers[il].bo,
-                            Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                            Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 }
             }

@@ -8489,9 +8678,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -8607,9 +8796,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -8720,9 +8909,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -8834,9 +9023,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -8989,9 +9178,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -9106,9 +9295,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -9219,9 +9408,9 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
             struct ggml_tensor * sa_out = cur;

@@ -9322,9 +9511,9 @@ struct llm_build_context {

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9429,9 +9618,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9545,9 +9734,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9662,9 +9851,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9792,9 +9981,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -9913,9 +10102,9 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -10032,9 +10221,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -10322,9 +10511,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -10453,9 +10642,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, nullptr,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
@@ -10882,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-
+    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
+    // this allows to process multiple sequences in parallel with ALiBi-based models
+    if (hparams.use_alibi) {
         const int64_t n_kv = kv_self.n;

         GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11264,7 +11455,7 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(
+            kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
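The rewritten heuristic clamps the attended KV window to at least 256 cells, rounds the occupied-cell count up to a multiple of 256, and caps the result at the cache size. A worked example:

```cpp
// kv_self.n for a 4096-cell cache at three occupancy levels.
#include <algorithm>
#include <cstdio>

static unsigned pad(unsigned x, unsigned n) { return (x + n - 1) / n * n; }

int main() {
    const unsigned kv_size = 4096;
    for (unsigned cell_max : {10u, 300u, 5000u}) {
        unsigned n = std::min(kv_size, std::max(256u, pad(cell_max, 256)));
        std::printf("cell_max=%u -> kv_self.n=%u\n", cell_max, n); // 256, 512, 4096
    }
    return 0;
}
```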
@@ -11432,6 +11623,10 @@ static int llama_decode_internal(
         }
     }

+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
     return 0;
 }

@@ -11457,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // each move requires 6*n_layer tensors (see build_defrag)
     //   - source view, destination view, copy operation
     //   - x2 for keys and values
-    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);

     // determine which KV cells to move where
     //
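The temporary fix reserves `2*n_layer` graph nodes of headroom before dividing the remaining node budget by the 6 tensors each move costs. The arithmetic for two illustrative layer counts (for 32 layers both formulas give 42 moves; the headroom only bites as n_layer grows):

```cpp
// Old vs. new defrag move budget, assuming LLAMA_MAX_NODES = 8192 as above.
#include <cstdio>

int main() {
    const unsigned LLAMA_MAX_NODES = 8192;
    for (unsigned n_layer : {32u, 80u}) {
        unsigned old_moves = LLAMA_MAX_NODES / (6 * n_layer);
        unsigned new_moves = (LLAMA_MAX_NODES - 2 * n_layer) / (6 * n_layer);
        std::printf("n_layer=%u old=%u new=%u\n", n_layer, old_moves, new_moves); // 42/42, 17/16
    }
    return 0;
}
```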
@@ -11773,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
+    const auto & token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             auto buf = token_data.text.substr(3, 2);
@@ -11781,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ASSERT(false);
@@ -12003,7 +12200,94 @@ struct llm_tokenizer_bpe {
|
|
12003
12200
|
|
12004
12201
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12005
12202
|
int final_prev_index = -1;
|
12006
|
-
|
12203
|
+
|
12204
|
+
std::vector<std::string> word_collection;
|
12205
|
+
switch (vocab.type) {
|
12206
|
+
case LLAMA_VOCAB_TYPE_BPE:
|
12207
|
+
switch (vocab.type_pre) {
|
12208
|
+
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
12209
|
+
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12210
|
+
word_collection = unicode_regex_split(text, {
|
12211
|
+
// original regex from tokenizer.json
|
12212
|
+
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12213
|
+
|
12214
|
+
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
12215
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12216
|
+
});
|
12217
|
+
break;
|
12218
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
12219
|
+
word_collection = unicode_regex_split(text, {
|
12220
|
+
"[\r\n]",
|
12221
|
+
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
12222
|
+
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
12223
|
+
"\\s+$",
|
12224
|
+
"[一-龥ࠀ-一가-]+",
|
12225
|
+
"\\p{N}+",
|
12226
|
+
});
|
12227
|
+
break;
|
12228
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
12229
|
+
word_collection = unicode_regex_split(text, {
|
12230
|
+
"[\r\n]",
|
12231
|
+
"\\s?\\p{L}+",
|
12232
|
+
"\\s?\\p{P}+",
|
12233
|
+
"[一-龥ࠀ-一가-]+",
|
12234
|
+
"\\p{N}",
|
12235
|
+
});
|
12236
|
+
break;
|
12237
|
+
case LLAMA_VOCAB_PRE_TYPE_FALCON:
|
12238
|
+
word_collection = unicode_regex_split(text, {
|
12239
|
+
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
12240
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12241
|
+
"[0-9][0-9][0-9]",
|
12242
|
+
});
|
12243
|
+
break;
|
12244
|
+
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
12245
|
+
// TODO: MPT pre-tokenization regexes are unknown
|
12246
|
+
// the following are close, but not exact. run the following:
|
12247
|
+
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
|
12248
|
+
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
|
12249
|
+
word_collection = unicode_regex_split(text, {
|
12250
|
+
"\\s?\\p{L}+",
|
12251
|
+
"\\s?\\p{P}+",
|
12252
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12253
|
+
});
|
12254
|
+
break;
|
12255
|
+
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
12256
|
+
case LLAMA_VOCAB_PRE_TYPE_REFACT:
|
12257
|
+
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
12258
|
+
word_collection = unicode_regex_split(text, {
|
12259
|
+
"\\p{N}",
|
12260
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12261
|
+
});
|
12262
|
+
break;
|
12263
|
+
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
12264
|
+
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
12265
|
+
word_collection = unicode_regex_split(text, {
|
12266
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12267
|
+
});
|
12268
|
+
break;
|
12269
|
+
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
12270
|
+
word_collection = unicode_regex_split(text, {
|
12271
|
+
// original regex from tokenizer.json
|
12272
|
+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
12273
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12274
|
+
});
|
12275
|
+
break;
|
12276
|
+
default:
|
12277
|
+
// default regex for BPE tokenization pre-processing
|
12278
|
+
word_collection = unicode_regex_split(text, {
|
12279
|
+
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
12280
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12281
|
+
"\\p{N}+",
|
12282
|
+
"[0-9][0-9][0-9]",
|
12283
|
+
});
|
12284
|
+
break;
|
12285
|
+
}
|
12286
|
+
break;
|
12287
|
+
default:
|
12288
|
+
GGML_ASSERT(false);
|
12289
|
+
break;
|
12290
|
+
}
|
12007
12291
|
|
12008
12292
|
symbols_final.clear();
|
12009
12293
|
|
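Every case in the switch above reduces to the same mechanism: unicode_regex_split applies a list of patterns in order, and the matches become the word candidates fed to BPE merging. A conceptual, self-contained sketch with std::regex (the real splitter understands \p{L}, \p{N} and friends natively, which std::regex does not, so ASCII stand-ins are used here):

#include <regex>
#include <string>
#include <vector>

// Conceptual sketch of GPT-2 style pre-tokenization: contractions first,
// then letter runs, digit runs, punctuation runs, and whitespace.
static std::vector<std::string> naive_pre_tokenize(const std::string & text) {
    static const std::regex re(
        "('s|'t|'re|'ve|'m|'ll|'d)| ?[A-Za-z]+| ?[0-9]+| ?[^\\sA-Za-z0-9]+|\\s+");
    std::vector<std::string> words;
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
         it != std::sregex_iterator(); ++it) {
        words.push_back(it->str());
    }
    return words; // "I'd go" -> {"I", "'d", " go"}
}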
@@ -12130,145 +12414,6 @@ private:
|
|
12130
12414
|
work_queue.push(bigram);
|
12131
12415
|
}
|
12132
12416
|
|
12133
|
-
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
12134
|
-
std::vector<std::string> bpe_words;
|
12135
|
-
std::vector<std::string> bpe_encoded_words;
|
12136
|
-
|
12137
|
-
std::string token = "";
|
12138
|
-
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
12139
|
-
bool collecting_numeric = false;
|
12140
|
-
bool collecting_letter = false;
|
12141
|
-
bool collecting_special = false;
|
12142
|
-
bool collecting_whitespace_lookahead = false;
|
12143
|
-
bool collecting = false;
|
12144
|
-
|
12145
|
-
std::vector<std::string> text_utf;
|
12146
|
-
text_utf.reserve(text.size());
|
12147
|
-
bpe_words.reserve(text.size());
|
12148
|
-
bpe_encoded_words.reserve(text.size());
|
12149
|
-
|
12150
|
-
const auto cpts = unicode_cpts_from_utf8(text);
|
12151
|
-
for (size_t i = 0; i < cpts.size(); ++i)
|
12152
|
-
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
12153
|
-
|
12154
|
-
for (int i = 0; i < (int)text_utf.size(); i++) {
|
12155
|
-
const std::string & utf_char = text_utf[i];
|
12156
|
-
bool split_condition = false;
|
12157
|
-
int bytes_remain = text_utf.size() - i;
|
12158
|
-
// forward backward lookups
|
12159
|
-
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
12160
|
-
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
12161
|
-
|
12162
|
-
// handling contractions
|
12163
|
-
if (!split_condition && bytes_remain >= 2) {
|
12164
|
-
// 's|'t|'m|'d
|
12165
|
-
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
12166
|
-
split_condition = true;
|
12167
|
-
}
|
12168
|
-
if (split_condition) {
|
12169
|
-
if (token.size()) {
|
12170
|
-
bpe_words.emplace_back(token); // push previous content as token
|
12171
|
-
}
|
12172
|
-
token = utf_char + utf_char_next;
|
12173
|
-
bpe_words.emplace_back(token);
|
12174
|
-
token = "";
|
12175
|
-
i++;
|
12176
|
-
continue;
|
12177
|
-
}
|
12178
|
-
}
|
12179
|
-
if (!split_condition && bytes_remain >= 3) {
|
12180
|
-
// 're|'ve|'ll
|
12181
|
-
if (utf_char == "\'" && (
|
12182
|
-
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
12183
|
-
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
12184
|
-
(utf_char_next == "l" && utf_char_next_next == "l"))
|
12185
|
-
) {
|
12186
|
-
split_condition = true;
|
12187
|
-
}
|
12188
|
-
if (split_condition) {
|
12189
|
-
// current token + next token can be defined
|
12190
|
-
if (token.size()) {
|
12191
|
-
bpe_words.emplace_back(token); // push previous content as token
|
12192
|
-
}
|
12193
|
-
token = utf_char + utf_char_next + utf_char_next_next;
|
12194
|
-
bpe_words.emplace_back(token); // the contraction
|
12195
|
-
token = "";
|
12196
|
-
i += 2;
|
12197
|
-
continue;
|
12198
|
-
}
|
12199
|
-
}
|
12200
|
-
|
12201
|
-
if (!split_condition && !collecting) {
|
12202
|
-
if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
12203
|
-
collecting_letter = true;
|
12204
|
-
collecting = true;
|
12205
|
-
}
|
12206
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
12207
|
-
collecting_numeric = true;
|
12208
|
-
collecting = true;
|
12209
|
-
}
|
12210
|
-
else if (
|
12211
|
-
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
12212
|
-
(!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
12213
|
-
) {
|
12214
|
-
collecting_special = true;
|
12215
|
-
collecting = true;
|
12216
|
-
}
|
12217
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
12218
|
-
collecting_whitespace_lookahead = true;
|
12219
|
-
collecting = true;
|
12220
|
-
}
|
12221
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
12222
|
-
split_condition = true;
|
12223
|
-
}
|
12224
|
-
}
|
12225
|
-
else if (!split_condition && collecting) {
|
12226
|
-
if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
12227
|
-
split_condition = true;
|
12228
|
-
}
|
12229
|
-
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
|
12230
|
-
split_condition = true;
|
12231
|
-
}
|
12232
|
-
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
12233
|
-
split_condition = true;
|
12234
|
-
}
|
12235
|
-
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
12236
|
-
split_condition = true;
|
12237
|
-
}
|
12238
|
-
}
|
12239
|
-
|
12240
|
-
if (utf_char_next == "") {
|
12241
|
-
split_condition = true; // final
|
12242
|
-
token += utf_char;
|
12243
|
-
}
|
12244
|
-
|
12245
|
-
if (split_condition) {
|
12246
|
-
if (token.size()) {
|
12247
|
-
bpe_words.emplace_back(token);
|
12248
|
-
}
|
12249
|
-
token = utf_char;
|
12250
|
-
collecting = false;
|
12251
|
-
collecting_letter = false;
|
12252
|
-
collecting_numeric = false;
|
12253
|
-
collecting_special = false;
|
12254
|
-
collecting_whitespace_lookahead = false;
|
12255
|
-
}
|
12256
|
-
else {
|
12257
|
-
token += utf_char;
|
12258
|
-
}
|
12259
|
-
}
|
12260
|
-
|
12261
|
-
for (std::string & word : bpe_words) {
|
12262
|
-
std::string encoded_token = "";
|
12263
|
-
for (char & c : word) {
|
12264
|
-
encoded_token += unicode_byte_to_utf8(c);
|
12265
|
-
}
|
12266
|
-
bpe_encoded_words.emplace_back(encoded_token);
|
12267
|
-
}
|
12268
|
-
|
12269
|
-
return bpe_encoded_words;
|
12270
|
-
}
|
12271
|
-
|
12272
12417
|
const llama_vocab & vocab;
|
12273
12418
|
|
12274
12419
|
std::vector<llm_symbol> symbols;
|
@@ -12343,7 +12488,7 @@ struct llm_tokenizer_wpm {
|
|
12343
12488
|
continue;
|
12344
12489
|
}
|
12345
12490
|
code = unicode_tolower(code);
|
12346
|
-
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
12491
|
+
if (type == CODEPOINT_TYPE_SEPARATOR) {
|
12347
12492
|
code = ' ';
|
12348
12493
|
}
|
12349
12494
|
std::string s = unicode_cpt_to_utf8(code);
|
@@ -12588,7 +12733,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12588
12733
|
} break;
|
12589
12734
|
case LLAMA_VOCAB_TYPE_BPE:
|
12590
12735
|
{
|
12591
|
-
if (add_special && vocab.special_add_bos == 1) {
|
12736
|
+
if (add_special && vocab.special_add_bos != 0) {
|
12592
12737
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
12593
12738
|
output.push_back(vocab.special_bos_id);
|
12594
12739
|
}
|
@@ -14030,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
|
|
14030
14175
|
if (qtype.to_float == NULL) {
|
14031
14176
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
14032
14177
|
}
|
14033
|
-
} else if (tensor->type != GGML_TYPE_F16) {
|
14178
|
+
} else if (tensor->type != GGML_TYPE_F16 &&
|
14179
|
+
tensor->type != GGML_TYPE_BF16) {
|
14034
14180
|
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
14035
14181
|
}
|
14036
14182
|
|
14037
14183
|
if (nthread < 2) {
|
14038
14184
|
if (tensor->type == GGML_TYPE_F16) {
|
14039
14185
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
14186
|
+
} else if (tensor->type == GGML_TYPE_BF16) {
|
14187
|
+
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
14040
14188
|
} else if (ggml_is_quantized(tensor->type)) {
|
14041
14189
|
qtype.to_float(tensor->data, f32_output, nelements);
|
14042
14190
|
} else {
|
@@ -14045,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
|
|
14045
14193
|
return;
|
14046
14194
|
}
|
14047
14195
|
|
14048
|
-
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
14196
|
+
size_t block_size;
|
14197
|
+
if (tensor->type == GGML_TYPE_F16 ||
|
14198
|
+
tensor->type == GGML_TYPE_BF16) {
|
14199
|
+
block_size = 1;
|
14200
|
+
} else {
|
14201
|
+
block_size = (size_t)ggml_blck_size(tensor->type);
|
14202
|
+
}
|
14203
|
+
|
14049
14204
|
size_t block_size_bytes = ggml_type_size(tensor->type);
|
14050
14205
|
|
14051
14206
|
GGML_ASSERT(nelements % block_size == 0);
|
@@ -14064,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
|
|
14064
14219
|
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
14065
14220
|
if (typ == GGML_TYPE_F16) {
|
14066
14221
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
14222
|
+
} else if (typ == GGML_TYPE_BF16) {
|
14223
|
+
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
14067
14224
|
} else {
|
14068
14225
|
qtype.to_float(inbuf, outbuf, nels);
|
14069
14226
|
}
|
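The BF16 branches above are cheap because bfloat16 is the top half of an IEEE-754 float32, so widening a value is a single shift; ggml_bf16_to_fp32_row applies this per element (with vectorized paths in ggml). A minimal sketch of the scalar conversion:

#include <cstdint>
#include <cstring>

// bf16 -> f32: place the 16 stored bits into the high half of a float32.
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// f32 -> bf16 by truncation; production converters usually round to
// nearest-even instead of truncating.
static uint16_t f32_to_bf16_trunc(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return (uint16_t) (bits >> 16);
}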
@@ -14360,14 +14517,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
14360
14517
|
}
|
14361
14518
|
|
14362
14519
|
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
14363
|
-
std::mutex mutex;
|
14364
|
-
int64_t counter = 0;
|
14365
|
-
size_t new_size = 0;
|
14366
14520
|
if (nthread < 2) {
|
14367
14521
|
// single-thread
|
14368
|
-
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
14522
|
+
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
14523
|
+
if (!ggml_validate_row_data(new_type, new_data, new_size)) {
|
14524
|
+
throw std::runtime_error("quantized data validation failed");
|
14525
|
+
}
|
14526
|
+
return new_size;
|
14369
14527
|
}
|
14370
|
-
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
|
14528
|
+
|
14529
|
+
std::mutex mutex;
|
14530
|
+
int64_t counter = 0;
|
14531
|
+
size_t new_size = 0;
|
14532
|
+
bool valid = true;
|
14533
|
+
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
|
14371
14534
|
nrows, n_per_row, imatrix]() {
|
14372
14535
|
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
14373
14536
|
size_t local_size = 0;
|
@@ -14382,7 +14545,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
14382
14545
|
}
|
14383
14546
|
lock.unlock();
|
14384
14547
|
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
14385
|
-
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
14548
|
+
size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
14549
|
+
local_size += this_size;
|
14550
|
+
|
14551
|
+
// validate the quantized data
|
14552
|
+
const size_t row_size = ggml_row_size(new_type, n_per_row);
|
14553
|
+
void * this_data = (char *) new_data + first_row * row_size;
|
14554
|
+
if (!ggml_validate_row_data(new_type, this_data, this_size)) {
|
14555
|
+
std::unique_lock<std::mutex> lock(mutex);
|
14556
|
+
valid = false;
|
14557
|
+
break;
|
14558
|
+
}
|
14386
14559
|
}
|
14387
14560
|
};
|
14388
14561
|
for (int it = 0; it < nthread - 1; ++it) {
|
@@ -14391,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
14391
14564
|
compute();
|
14392
14565
|
for (auto & w : workers) { w.join(); }
|
14393
14566
|
workers.clear();
|
14567
|
+
if (!valid) {
|
14568
|
+
throw std::runtime_error("quantized data validation failed");
|
14569
|
+
}
|
14394
14570
|
return new_size;
|
14395
14571
|
}
|
14396
14572
|
|
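The multi-threaded path above is a counter-as-work-queue pattern: workers claim row chunks under a mutex, quantize outside the lock, and record the first validation failure in a shared flag that is checked once after join(). A condensed sketch of the same pattern (names illustrative, not the file's):

#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

static bool process_chunks(int64_t n_chunks, int nthread) {
    std::mutex mutex;
    int64_t counter = 0;
    bool    valid   = true;

    auto worker = [&]() {
        for (;;) {
            int64_t chunk;
            {
                std::lock_guard<std::mutex> lock(mutex);
                if (!valid || counter >= n_chunks) return;
                chunk = counter++;
            }
            const bool ok = (chunk >= 0); // placeholder for quantize + validate
            if (!ok) {
                std::lock_guard<std::mutex> lock(mutex);
                valid = false;
                return;
            }
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
    worker(); // the calling thread participates, as in the code above
    for (auto & w : workers) w.join();
    return valid;
}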
@@ -14405,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14405
14581
|
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
14406
14582
|
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
14407
14583
|
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
14584
|
+
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
|
14408
14585
|
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
14409
14586
|
|
14410
14587
|
// K-quants
|
@@ -14453,7 +14630,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14453
14630
|
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
14454
14631
|
kv_overrides = v->data();
|
14455
14632
|
}
|
14456
|
-
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
14633
|
+
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
|
14457
14634
|
ml.init_mappings(false); // no prefetching
|
14458
14635
|
|
14459
14636
|
llama_model model;
|
@@ -14491,11 +14668,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14491
14668
|
for (auto & o : overrides) {
|
14492
14669
|
if (o.key[0] == 0) break;
|
14493
14670
|
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
14494
|
-
gguf_set_val_f32(ctx_out, o.key, o.float_value);
|
14671
|
+
gguf_set_val_f32(ctx_out, o.key, o.val_f64);
|
14495
14672
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
14496
|
-
gguf_set_val_i32(ctx_out, o.key, o.int_value);
|
14673
|
+
gguf_set_val_i32(ctx_out, o.key, o.val_i64);
|
14497
14674
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
14498
|
-
gguf_set_val_bool(ctx_out, o.key, o.bool_value);
|
14675
|
+
gguf_set_val_bool(ctx_out, o.key, o.val_bool);
|
14676
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
14677
|
+
gguf_set_val_str(ctx_out, o.key, o.val_str);
|
14499
14678
|
} else {
|
14500
14679
|
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
14501
14680
|
}
|
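With the string case added, all four override tags now survive re-quantization. A usage sketch; the key/tag/val_* member names follow the llama_model_kv_override struct as used above, and the 128-byte key size is an assumption from llama.h:

#include <cstring>
#include "llama.h"

// Sketch: force a boolean metadata value when re-quantizing a model.
static llama_model_kv_override make_bool_override(const char * key, bool value) {
    llama_model_kv_override o{};
    std::strncpy(o.key, key, sizeof(o.key) - 1);
    o.tag      = LLAMA_KV_OVERRIDE_TYPE_BOOL;
    o.val_bool = value;
    return o;
}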
@@ -14814,7 +14993,7 @@ static int llama_apply_lora_from_file_internal(
|
|
14814
14993
|
std::unique_ptr<llama_model_loader> ml;
|
14815
14994
|
if (path_base_model) {
|
14816
14995
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
14817
|
-
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
14996
|
+
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
14818
14997
|
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
14819
14998
|
}
|
14820
14999
|
|
@@ -15073,6 +15252,7 @@ struct llama_model_params llama_model_default_params() {
|
|
15073
15252
|
/*.vocab_only =*/ false,
|
15074
15253
|
/*.use_mmap =*/ true,
|
15075
15254
|
/*.use_mlock =*/ false,
|
15255
|
+
/*.check_tensors =*/ false,
|
15076
15256
|
};
|
15077
15257
|
|
15078
15258
|
#ifdef GGML_USE_METAL
|
@@ -15109,6 +15289,7 @@ struct llama_context_params llama_context_default_params() {
|
|
15109
15289
|
/*.logits_all =*/ false,
|
15110
15290
|
/*.embeddings =*/ false,
|
15111
15291
|
/*.offload_kqv =*/ true,
|
15292
|
+
/*.flash_attn =*/ false,
|
15112
15293
|
/*.abort_callback =*/ nullptr,
|
15113
15294
|
/*.abort_callback_data =*/ nullptr,
|
15114
15295
|
};
|
@@ -15275,6 +15456,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15275
15456
|
cparams.defrag_thold = params.defrag_thold;
|
15276
15457
|
cparams.embeddings = params.embeddings;
|
15277
15458
|
cparams.offload_kqv = params.offload_kqv;
|
15459
|
+
cparams.flash_attn = params.flash_attn;
|
15278
15460
|
cparams.pooling_type = params.pooling_type;
|
15279
15461
|
|
15280
15462
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
@@ -15282,12 +15464,20 @@ struct llama_context * llama_new_context_with_model(
|
|
15282
15464
|
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
15283
15465
|
|
15284
15466
|
// this is necessary due to kv_self.n being padded later during inference
|
15285
|
-
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
|
15467
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
|
15286
15468
|
|
15287
15469
|
// with causal attention, the batch size is limited by the context size
|
15288
15470
|
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
15289
|
-
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
15290
15471
|
|
15472
|
+
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
|
15473
|
+
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
|
15474
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
|
15475
|
+
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
|
15476
|
+
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
|
15477
|
+
cparams.n_batch = GGML_KQ_MASK_PAD;
|
15478
|
+
}
|
15479
|
+
|
15480
|
+
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
15291
15481
|
|
15292
15482
|
cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
15293
15483
|
hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
|
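Both the context padding and the batch clamp above protect GPU kernels from reading past the end of the KQ mask. The clamping logic in isolation (GGML_KQ_MASK_PAD is defined in ggml.h; the value 32 below is an assumption for illustration):

#include <algorithm>
#include <cstdint>

constexpr uint32_t kq_mask_pad = 32; // stand-in for GGML_KQ_MASK_PAD

static uint32_t clamp_batch(uint32_t n_ctx, uint32_t n_batch, bool causal) {
    // with causal attention, the batch size is limited by the context size
    uint32_t nb = causal ? std::min(n_ctx, n_batch) : n_batch;
    // and it must cover at least one KQ-mask pad block (see the warning above)
    return std::max(nb, kq_mask_pad);
}

// clamp_batch(512, 8, true) == 32: a tiny batch is grown to the pad size.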
@@ -15319,6 +15509,16 @@ struct llama_context * llama_new_context_with_model(
|
|
15319
15509
|
}
|
15320
15510
|
}
|
15321
15511
|
|
15512
|
+
if (cparams.flash_attn && hparams.use_alibi) {
|
15513
|
+
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
|
15514
|
+
cparams.flash_attn = false;
|
15515
|
+
}
|
15516
|
+
|
15517
|
+
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15518
|
+
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15519
|
+
cparams.flash_attn = false;
|
15520
|
+
}
|
15521
|
+
|
15322
15522
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
15323
15523
|
params.seed = time(NULL);
|
15324
15524
|
}
|
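The new flag is opt-in through llama_context_params; the two guards above then force it back off for ALiBi models and Grok. A minimal usage sketch (context size chosen arbitrarily):

#include "llama.h"

// Sketch: request flash attention when creating a context. Incompatible
// models trigger the warnings above and fall back to the regular path.
static llama_context * make_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // new in this release
    cparams.n_ctx      = 4096;
    return llama_new_context_with_model(model, cparams);
}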
@@ -15326,6 +15526,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15326
15526
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
15327
15527
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
15328
15528
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
15529
|
+
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
15329
15530
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
15330
15531
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
15331
15532
|
|
@@ -15454,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15454
15655
|
}
|
15455
15656
|
ctx->backends.push_back(ctx->backend_cpu);
|
15456
15657
|
|
15457
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
15658
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
15458
15659
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
15459
15660
|
llama_free(ctx);
|
15460
15661
|
return nullptr;
|
@@ -16053,6 +16254,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
16053
16254
|
const size_t s_kv_head = sizeof(uint32_t);
|
16054
16255
|
const size_t s_kv_size = sizeof(uint32_t);
|
16055
16256
|
const size_t s_kv_used = sizeof(uint32_t);
|
16257
|
+
const size_t s_v_trans = sizeof(uint32_t);
|
16056
16258
|
const size_t s_kv = ctx->kv_self.total_size();
|
16057
16259
|
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
16058
16260
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
@@ -16070,10 +16272,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
16070
16272
|
+ s_kv_head
|
16071
16273
|
+ s_kv_size
|
16072
16274
|
+ s_kv_used
|
16275
|
+
+ s_v_trans
|
16073
16276
|
+ s_kv
|
16074
16277
|
+ s_kv_cells
|
16075
16278
|
);
|
16076
16279
|
|
16280
|
+
// on session change it is very likely that the state size has changed - so we need to update this function
|
16281
|
+
static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
|
16282
|
+
|
16077
16283
|
return s_total;
|
16078
16284
|
}
|
16079
16285
|
|
@@ -16219,11 +16425,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
16219
16425
|
const uint32_t kv_size = kv_self.size;
|
16220
16426
|
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
16221
16427
|
const uint32_t kv_used = kv_self.used;
|
16428
|
+
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
|
16222
16429
|
|
16223
16430
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
16224
16431
|
data_ctx->write(&kv_head, sizeof(kv_head));
|
16225
16432
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
16226
16433
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
16434
|
+
data_ctx->write(&v_trans, sizeof(v_trans));
|
16227
16435
|
|
16228
16436
|
if (kv_buf_size) {
|
16229
16437
|
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
@@ -16236,7 +16444,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
16236
16444
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
16237
16445
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
16238
16446
|
|
16239
|
-
if (kv_self.recurrent) {
|
16447
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
16240
16448
|
// v is contiguous for recurrent models
|
16241
16449
|
// TODO: use other tensors for state models than k and v
|
16242
16450
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16369,11 +16577,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16369
16577
|
uint32_t kv_head;
|
16370
16578
|
uint32_t kv_size;
|
16371
16579
|
uint32_t kv_used;
|
16580
|
+
uint32_t v_trans;
|
16372
16581
|
|
16373
16582
|
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
16374
16583
|
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
16375
16584
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
16376
16585
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
16586
|
+
memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
|
16587
|
+
|
16588
|
+
GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
|
16377
16589
|
|
16378
16590
|
if (kv_self.size != kv_size) {
|
16379
16591
|
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
@@ -16383,6 +16595,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16383
16595
|
__func__, kv_head, kv_size, kv_self.size);
|
16384
16596
|
}
|
16385
16597
|
|
16598
|
+
llama_kv_cache_clear(ctx);
|
16599
|
+
|
16386
16600
|
if (kv_buf_size) {
|
16387
16601
|
const size_t pre_kv_buf_size = inp - src;
|
16388
16602
|
|
@@ -16394,7 +16608,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16394
16608
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
16395
16609
|
inp += k_size;
|
16396
16610
|
|
16397
|
-
if (kv_self.recurrent) {
|
16611
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
16398
16612
|
// v is contiguous for recurrent models
|
16399
16613
|
// TODO: use other tensors for state models than k and v
|
16400
16614
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16416,8 +16630,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16416
16630
|
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
16417
16631
|
}
|
16418
16632
|
|
16419
|
-
llama_kv_cache_clear(ctx);
|
16420
|
-
|
16421
16633
|
ctx->kv_self.head = kv_head;
|
16422
16634
|
ctx->kv_self.used = kv_used;
|
16423
16635
|
|
@@ -16677,28 +16889,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
|
16677
16889
|
}
|
16678
16890
|
}
|
16679
16891
|
|
16680
|
-
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16681
|
-
const uint32_t kv_size = kv_self.size;
|
16682
|
-
for (int il = 0; il < (int)n_layer; ++il) {
|
16683
|
-
// Write value type
|
16684
|
-
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16685
|
-
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16892
|
+
// TODO: simplify, reduce copy-paste
|
16893
|
+
if (!kv_self.v_trans) {
|
16894
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16895
|
+
// Write value type
|
16896
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16897
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16686
16898
|
|
16687
|
-
// Write element size
|
16688
|
-
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16689
|
-
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16899
|
+
// Write row size of value
|
16900
|
+
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
16901
|
+
data_ctx.write(&v_size_row, sizeof(v_size_row));
|
16690
16902
|
|
16691
|
-
// For each row, we get the element values of each cell
|
16692
|
-
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16693
|
-
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16903
|
+
// Read each range of cells of v_size length each into tmp_buf and write out
|
16694
16904
|
for (const auto & range : cell_ranges) {
|
16695
16905
|
const size_t range_size = range.second - range.first;
|
16696
|
-
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16697
|
-
tmp_buf.resize(range_size * v_size_el);
|
16698
|
-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16906
|
+
tmp_buf.resize(range_size * v_size_row);
|
16907
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
16699
16908
|
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16700
16909
|
}
|
16701
16910
|
}
|
16911
|
+
} else {
|
16912
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16913
|
+
const uint32_t kv_size = kv_self.size;
|
16914
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16915
|
+
// Write value type
|
16916
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16917
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16918
|
+
|
16919
|
+
// Write element size
|
16920
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16921
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16922
|
+
|
16923
|
+
// For each row, we get the element values of each cell
|
16924
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16925
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16926
|
+
for (const auto & range : cell_ranges) {
|
16927
|
+
const size_t range_size = range.second - range.first;
|
16928
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16929
|
+
tmp_buf.resize(range_size * v_size_el);
|
16930
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16931
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16932
|
+
}
|
16933
|
+
}
|
16934
|
+
}
|
16702
16935
|
}
|
16703
16936
|
|
16704
16937
|
return data_ctx.get_size_written();
|
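The two branches above differ only in memory layout: without v_trans a cell is one contiguous row, while with v_trans dimension j of cell c lives kv_size elements apart, so every row of the transposed matrix contributes a separate strided range. The offset arithmetic in isolation:

#include <cstddef>
#include <cstdint>

// Non-transposed: cells are rows, so a cell range is one contiguous slab.
static size_t offset_rows(uint32_t cell, size_t v_size_row) {
    return (size_t) cell * v_size_row;
}

// Transposed: dimension j of cell c starts (c + j * kv_size) elements in,
// exactly the src_offset/dst_offset computed by the loops above.
static size_t offset_trans(uint32_t cell, uint32_t j, uint32_t kv_size, size_t v_size_el) {
    return ((size_t) cell + (size_t) j * kv_size) * v_size_el;
}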
@@ -16823,41 +17056,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
|
|
16823
17056
|
}
|
16824
17057
|
}
|
16825
17058
|
|
16826
|
-
//
|
16827
|
-
|
16828
|
-
|
16829
|
-
|
16830
|
-
|
16831
|
-
|
16832
|
-
|
16833
|
-
|
16834
|
-
|
16835
|
-
|
16836
|
-
|
16837
|
-
|
17059
|
+
// TODO: simplify, reduce copy-paste
|
17060
|
+
if (!kv_self.v_trans) {
|
17061
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
17062
|
+
// Read type of value
|
17063
|
+
int32_t v_type_i_ref;
|
17064
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
17065
|
+
inp += sizeof(v_type_i_ref);
|
17066
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
17067
|
+
if (v_type_i != v_type_i_ref) {
|
17068
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17069
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
17070
|
+
return 0;
|
17071
|
+
}
|
16838
17072
|
|
16839
|
-
// Read element size of value
|
16840
|
-
size_t v_size_el_ref;
|
16841
|
-
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
16842
|
-
inp += sizeof(v_size_el_ref);
|
16843
|
-
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16844
|
-
if (v_size_el != v_size_el_ref) {
|
16845
|
-
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16846
|
-
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
16847
|
-
return 0;
|
16848
|
-
}
|
17073
|
+
// Read row size of value
|
17074
|
+
size_t v_size_row_ref;
|
17075
|
+
memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
|
17076
|
+
inp += sizeof(v_size_row_ref);
|
17077
|
+
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
17078
|
+
if (v_size_row != v_size_row_ref) {
|
17079
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17080
|
+
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
|
17081
|
+
return 0;
|
17082
|
+
}
|
16849
17083
|
|
16850
|
-
if (cell_count) {
|
16851
|
-
// For each row in the transposed matrix, read the values for the whole cell range
|
16852
|
-
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16853
|
-
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
16854
|
-
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
16855
|
-
inp += cell_count * v_size_el;
|
17084
|
+
if (cell_count) {
|
17085
|
+
// Read and set the values for the whole cell range
|
17086
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
|
17087
|
+
inp += cell_count * v_size_row;
|
17088
|
+
}
|
17089
|
+
}
|
17090
|
+
} else {
|
17091
|
+
// For each layer, read the values for each cell (transposed)
|
17092
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
17093
|
+
// Read type of value
|
17094
|
+
int32_t v_type_i_ref;
|
17095
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
17096
|
+
inp += sizeof(v_type_i_ref);
|
17097
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
17098
|
+
if (v_type_i != v_type_i_ref) {
|
17099
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17100
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
17101
|
+
return 0;
|
17102
|
+
}
|
17103
|
+
|
17104
|
+
// Read element size of value
|
17105
|
+
size_t v_size_el_ref;
|
17106
|
+
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
17107
|
+
inp += sizeof(v_size_el_ref);
|
17108
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
17109
|
+
if (v_size_el != v_size_el_ref) {
|
17110
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17111
|
+
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
17112
|
+
return 0;
|
17113
|
+
}
|
17114
|
+
|
17115
|
+
if (cell_count) {
|
17116
|
+
// For each row in the transposed matrix, read the values for the whole cell range
|
17117
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
17118
|
+
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
17119
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
17120
|
+
inp += cell_count * v_size_el;
|
17121
|
+
}
|
16856
17122
|
}
|
16857
17123
|
}
|
16858
17124
|
}
|
16859
17125
|
|
16860
17126
|
const size_t nread = inp - src;
|
17127
|
+
|
16861
17128
|
return nread;
|
16862
17129
|
}
|
16863
17130
|
|
@@ -17238,9 +17505,10 @@ int32_t llama_tokenize(
|
|
17238
17505
|
|
17239
17506
|
static std::string llama_decode_text(const std::string & text) {
|
17240
17507
|
std::string decoded_text;
|
17241
|
-
|
17242
|
-
|
17243
|
-
|
17508
|
+
|
17509
|
+
const auto cpts = unicode_cpts_from_utf8(text);
|
17510
|
+
for (const auto cpt : cpts) {
|
17511
|
+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
|
17244
17512
|
}
|
17245
17513
|
|
17246
17514
|
return decoded_text;
|
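The rewritten llama_decode_text walks the codepoints of BPE-visible text and maps each one back to the raw byte it stands for. A self-contained sketch of the underlying byte-to-codepoint table in the GPT-2 convention (the exact printable ranges are the commonly used ones and are assumed here):

#include <cstdint>
#include <map>

// GPT-2 style table: printable bytes map to themselves, the rest are shifted
// into an unused range starting at U+0100; e.g. ' ' (0x20) becomes U+0120.
static std::map<uint32_t, uint8_t> make_cpt_to_byte() {
    std::map<uint32_t, uint8_t> m;
    int n = 0;
    for (int b = 0; b < 256; ++b) {
        const bool printable = (b >= 0x21 && b <= 0x7E) ||
                               (b >= 0xA1 && b <= 0xAC) ||
                               (b >= 0xAE && b <= 0xFF);
        if (printable) {
            m[(uint32_t) b] = (uint8_t) b;
        } else {
            m[256 + n++] = (uint8_t) b;
        }
    }
    return m;
}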
@@ -17604,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
|
17604
17872
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
17605
17873
|
|
17606
17874
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
17607
|
-
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
|
17875
|
+
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
|
17608
17876
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
17609
17877
|
};
|
17610
17878
|
|
@@ -17654,9 +17922,9 @@ const char * llama_print_system_info(void) {
|
|
17654
17922
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
17655
17923
|
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
17656
17924
|
#ifdef GGML_USE_LLAMAFILE
|
17657
|
-
s += "LAMMAFILE = 1 | ";
|
17925
|
+
s += "LLAMAFILE = 1 | ";
|
17658
17926
|
#else
|
17659
|
-
s += "LAMMAFILE = 0 | ";
|
17927
|
+
s += "LLAMAFILE = 0 | ";
|
17660
17928
|
#endif
|
17661
17929
|
|
17662
17930
|
return s.c_str();
|