llama_cpp 0.14.6 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
@@ -75,6 +75,7 @@
 #include <forward_list>
 #include <fstream>
 #include <functional>
+#include <future>
 #include <initializer_list>
 #include <locale>
 #include <map>
@@ -107,7 +108,6 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 60
 
-
 //
 // logging
 //
@@ -211,6 +211,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -314,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -390,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -793,6 +797,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1600,12 +1621,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
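Note: the public `llama_token_to_piece` C API now takes a trailing `bool special` that controls whether special/control tokens are rendered as text. A minimal caller sketch against the 0.15.0 API (the `detok` helper name is hypothetical, not part of the library):

```cpp
#include <string>
#include <vector>
#include "llama.h"

// Detokenize a single token, growing the buffer when the first call reports
// (as a negative count) that more space is needed.
std::string detok(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
    if (n < 0) {
        buf.resize(-n);
        n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
    }
    return std::string(buf.data(), n);
}
```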
@@ -1824,7 +1845,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool need_kq_pos = false;
+    bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1914,6 +1935,7 @@ struct llama_cparams {
     bool embeddings;
     bool causal_attn;
     bool offload_kqv;
+    bool flash_attn;
 
     enum llama_pooling_type pooling_type;
 
@@ -2017,8 +2039,8 @@ struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
     bool do_copy = false;
-    // with recurrent state models, a cell can hold the state for more than one past token
-    bool recurrent = false;
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans = true;    // the value tensor is transposed
 
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
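For context, `v_trans` records which of two V-cache memory layouts a context uses: the classic path stores V transposed (one row per channel, so the KQV matmul reads contiguous rows), while the flash-attention path keeps V row-major per token. A sketch of the index arithmetic implied by the two cache views later in this diff (the function name is hypothetical):

```cpp
#include <cstddef>

// Element (token t, channel c) of one layer's V cache:
//  - transposed layout (v_trans == true):  row = channel -> index = c*kv_size + t
//  - flat layout (flash attention):        row = token   -> index = t*n_embd_v_gqa + c
size_t v_cache_index(bool v_trans, size_t t, size_t c,
                     size_t kv_size, size_t n_embd_v_gqa) {
    return v_trans ? c * kv_size + t
                   : t * n_embd_v_gqa + c;
}
```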
@@ -2095,7 +2117,8 @@ struct llama_vocab {
         ttype type;
     };
 
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;
@@ -2120,7 +2143,7 @@ struct llama_vocab {
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
-    id special_eot_id = -1;
+    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
 
     bool add_space_prefix = true;
 
@@ -2316,11 +2339,14 @@ struct llama_context {
 
 static bool llama_kv_cache_init(
         struct llama_kv_cache & cache,
-        const llama_model & model,
+        const llama_context * ctx,
         ggml_type type_k,
         ggml_type type_v,
         uint32_t kv_size,
         bool offload) {
+    const llama_model & model = ctx->model;
+    const llama_cparams & cparams = ctx->cparams;
+
     const struct llama_hparams & hparams = model.hparams;
 
     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2331,8 +2357,9 @@ static bool llama_kv_cache_init(
 
     // TODO: find a nicer way to add other recurrent model architectures
     cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+    cache.v_trans   = !cparams.flash_attn;
 
-    // TODO: support mixed reccurent Transformer architectues
+    // TODO: support mixed recurrent Transformer architectures
     // NOTE: (!a || b) is a logical implication (a -> b)
     GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
     GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2543,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     }
     cache.head = 0;
     cache.used = 0;
+
+    for (auto & buf : cache.bufs) {
+        ggml_backend_buffer_clear(buf, 0);
+    }
 }
 
 static bool llama_kv_cache_seq_rm(
@@ -2863,6 +2894,7 @@ namespace GGUFMeta {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
             case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
         }
         return "unknown";
     }
@@ -2874,13 +2906,16 @@ namespace GGUFMeta {
                 __func__, override_type_to_str(ovrd->tag), ovrd->key);
         switch (ovrd->tag) {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
-                LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
+                LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
            } break;
            case LLAMA_KV_OVERRIDE_TYPE_INT: {
-                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
+                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
            } break;
            case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
+                LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+            } break;
+            case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                LLAMA_LOG_INFO("%s\n", ovrd->val_str);
            } break;
            default:
                // Shouldn't be possible to end up here, but just in case...
@@ -2899,7 +2934,7 @@ namespace GGUFMeta {
        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-                target = ovrd->bool_value;
+                target = ovrd->val_bool;
                return true;
            }
            return false;
@@ -2909,7 +2944,7 @@ namespace GGUFMeta {
        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-                target = ovrd->int_value;
+                target = ovrd->val_i64;
                return true;
            }
            return false;
@@ -2919,7 +2954,7 @@ namespace GGUFMeta {
        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-                target = ovrd->float_value;
+                target = ovrd->val_f64;
                return true;
            }
            return false;
@@ -2928,12 +2963,11 @@ namespace GGUFMeta {
        template<typename OT>
        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
-            (void)target;
-            (void)ovrd;
-            if (!ovrd) { return false; }
-            // Currently, we should never end up here so it would be a bug if we do.
-            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
-                ovrd ? ovrd->key : "NULL"));
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+                target = ovrd->val_str;
+                return true;
+            }
+            return false;
        }
 
        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
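The union members of `llama_model_kv_override` were renamed in this release (`bool_value`/`int_value`/`float_value` become `val_bool`/`val_i64`/`val_f64`) and a string variant `val_str` was added, which is what the new `try_override` specialization above consumes. A usage sketch, assuming llama.h's definition of the struct in this release (fixed-size `char` arrays for both `key` and `val_str`, terminated by an entry with an empty key):

```cpp
#include <cstdio>
#include "llama.h"

// Build a single string-typed override for "tokenizer.ggml.pre".
llama_model_params params_with_pre_override() {
    static llama_model_kv_override overrides[2] = {};  // second entry (empty key) terminates the list
    std::snprintf(overrides[0].key, sizeof(overrides[0].key), "tokenizer.ggml.pre");
    overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::snprintf(overrides[0].val_str, sizeof(overrides[0].val_str), "llama3");

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;
    return mparams;
}
```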
@@ -2966,6 +3000,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool check_tensors;
 
     llama_files files;
     llama_ftype ftype;
@@ -2980,9 +3015,13 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+            }
        }
    };
    std::vector<llama_tensor_weight> weights;
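The new constructor check rejects tensors whose data range falls outside the mapped file. The idiom (sketched below in isolation) needs two comparisons because the offset and size are unsigned:

```cpp
#include <cstddef>

// offs + nbytes < offs catches unsigned wrap-around from a corrupt offset;
// offs + nbytes > file_size catches tensors that run past the end of the file.
bool range_in_file(size_t offs, size_t nbytes, size_t file_size) {
    return offs + nbytes >= offs && offs + nbytes <= file_size;
}
```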
@@ -2995,7 +3034,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
             trace = atoi(getenv("LLAMA_TRACE"));
@@ -3021,15 +3060,15 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(0, cur->name, meta, cur);
+            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-        contexts.emplace_back(ctx);
 
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -3063,12 +3102,13 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
             }
 
+            files.emplace_back(new llama_file(split_path, "rb"));
+            contexts.emplace_back(ctx);
+
             // Save tensors data offset info of the shard.
             for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
             }
-            files.emplace_back(new llama_file(split_path, "rb"));
-            contexts.emplace_back(ctx);
 
             gguf_free(ctx_gguf);
         }
@@ -3091,9 +3131,17 @@ struct llama_model_loader {
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
+        std::set<std::string> tensor_names;
         for (auto & w : weights) {
             n_elements += ggml_nelements(w.tensor);
             n_bytes += ggml_nbytes(w.tensor);
+            // make sure there is no duplicated tensor names
+            const std::string name(w.tensor->name);
+            auto found = tensor_names.find(name);
+            if (found != tensor_names.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+            }
+            tensor_names.insert(name);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3199,6 +3247,7 @@ struct llama_model_loader {
         }
 
         this->use_mmap = use_mmap;
+        this->check_tensors = check_tensors;
     }
 
     ~llama_model_loader() {
@@ -3278,6 +3327,10 @@ struct llama_model_loader {
         return nullptr;
     }
 
+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -3453,6 +3506,10 @@ struct llama_model_loader {
                 file->seek(w.offs, SEEK_SET);
                 file->read_raw(cur->data, ggml_nbytes(cur));
             }
+
+            if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+            }
         }
 
         size_t size_done = 0;
@@ -3469,6 +3526,8 @@ struct llama_model_loader {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
+        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3490,37 +3549,66 @@ struct llama_model_loader {
                 if (bufs_mmap.count(weight->idx)) {
                     buf_mmap = bufs_mmap.at(weight->idx);
                 }
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
                     if (lmlocks) {
                         const auto & lmlock = lmlocks->at(weight->idx);
-                        lmlock->grow_to(weight->offs + ggml_nbytes(cur));
+                        lmlock->grow_to(weight->offs + n_size);
                     }
 
                     auto & mmap_used = mmaps_used[weight->idx];
                     mmap_used.first = std::min(mmap_used.first, weight->offs);
                     mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data, ggml_nbytes(cur));
+                    file->read_raw(cur->data, n_size);
+                    if (check_tensors) {
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                        }));
+                    }
                 } else {
-                    read_buf.resize(ggml_nbytes(cur));
+                    read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), ggml_nbytes(cur));
+                    file->read_raw(read_buf.data(), n_size);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
 
             size_done += n_size;
         }
 
+        // check validation results
+        bool validation_failed = false;
+        for (auto & future : validation_result) {
+            auto result = future.get();
+            if (!result.second) {
+                LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+                validation_failed = true;
+            }
+        }
+        if (validation_failed) {
+            throw std::runtime_error("found tensors with invalid data");
+        }
+
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
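With `check_tensors` enabled, the loader now overlaps data validation with I/O: each tensor read schedules a `ggml_validate_row_data` call on a `std::async` task, and the futures are joined once after the copy loop. A self-contained sketch of the same pattern (the chunk/validate names are illustrative, not llama.cpp API):

```cpp
#include <future>
#include <utility>
#include <vector>

// Stand-in for ggml_validate_row_data: flag chunks containing NaNs.
static bool chunk_ok(const std::vector<float> & chunk) {
    for (float x : chunk) { if (x != x) return false; }
    return true;
}

bool load_and_check(const std::vector<std::vector<float>> & chunks) {
    std::vector<std::future<std::pair<size_t, bool>>> results;
    for (size_t i = 0; i < chunks.size(); ++i) {
        // ...the main thread would read chunk i from disk here...
        results.emplace_back(std::async(std::launch::async, [i, &chunks] {
            return std::make_pair(i, chunk_ok(chunks[i]));
        }));
    }
    bool ok = true;
    for (auto & f : results) {
        if (!f.get().second) { ok = false; }  // get() joins the worker task
    }
    return ok;
}
```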
@@ -3770,7 +3858,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
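The 32-layer case is now disambiguated by head counts because LLaMA-2 7B and LLaMA-3 8B both have 32 layers: the 7B model uses full multi-head attention (`n_head == n_head_kv == 32` in its published config), while the 8B model uses grouped-query attention (`n_head == 32`, `n_head_kv == 8`), so unequal head counts identify it:

```cpp
#include <cstdint>

// Sketch of the heuristic above for 32-layer LLaMA checkpoints:
// equal head counts -> MHA -> 7B; unequal -> GQA -> 8B.
bool looks_like_llama3_8b(uint32_t n_head, uint32_t n_head_kv) {
    return n_head != n_head_kv;
}
```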
@@ -3955,6 +4043,16 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -4104,7 +4202,7 @@ static void llm_load_hparams(
     model.ftype = ml.ftype;
 
     if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.need_kq_pos = true;
+        hparams.use_alibi = true;
     }
 
     hparams.rope_type = llama_rope_type(&model);
@@ -4127,11 +4225,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -4145,7 +4245,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;
 
             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -4179,7 +4279,10 @@ static void llm_load_vocab(
                 vocab.special_prefix_id = 67;
                 vocab.special_suffix_id = 69;
                 vocab.special_middle_id = 68;
-                vocab.special_eot_id = 70;
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
             }
         }
 
@@ -4187,9 +4290,27 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "gpt2") {
-            vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else if (tokenizer_model == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = 102;
+            vocab.special_pad_id = 0;
+            vocab.special_cls_id = 101;
+            vocab.special_mask_id = 103;
+            vocab.add_space_prefix = false;
+        } else {
+            if (tokenizer_model == "gpt2") {
+                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+            } else {
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+                vocab.type = LLAMA_VOCAB_TYPE_SPM;
+                return;
+            }
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4223,23 +4344,50 @@ static void llm_load_vocab(
             vocab.special_pad_id = -1;
             vocab.special_cls_id = -1;
             vocab.special_mask_id = -1;
-        } else if (tokenizer_name == "bert") {
-            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+        }
 
-            // default special tokens
-            vocab.special_bos_id = -1;
-            vocab.special_eos_id = -1;
-            vocab.special_unk_id = 100;
-            vocab.special_sep_id = 102;
-            vocab.special_pad_id = 0;
-            vocab.special_cls_id = 101;
-            vocab.special_mask_id = 103;
-            vocab.add_space_prefix = false;
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
         } else {
-            LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
     }
 
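The pre-tokenizer chain above amounts to a lookup from the GGUF `tokenizer.ggml.pre` string to a `llama_vocab_pre_type` value; a table-style sketch of the same mapping (the enum constants are the ones added in this release):

```cpp
#include <map>
#include <string>
#include "llama.h"

static const std::map<std::string, llama_vocab_pre_type> k_pre_types = {
    { "default",        LLAMA_VOCAB_PRE_TYPE_DEFAULT        },
    { "llama3",         LLAMA_VOCAB_PRE_TYPE_LLAMA3         },
    { "llama-v3",       LLAMA_VOCAB_PRE_TYPE_LLAMA3         },
    { "llama-bpe",      LLAMA_VOCAB_PRE_TYPE_LLAMA3         },
    { "deepseek-llm",   LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   },
    { "deepseek-coder", LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER },
    { "falcon",         LLAMA_VOCAB_PRE_TYPE_FALCON         },
    { "mpt",            LLAMA_VOCAB_PRE_TYPE_MPT            },
    { "starcoder",      LLAMA_VOCAB_PRE_TYPE_STARCODER      },
    { "gpt-2",          LLAMA_VOCAB_PRE_TYPE_GPT2           },
};
```

An unrecognized value now throws instead of silently degrading, which is why regenerating older BPE GGUF files (so they carry this key) is recommended.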
@@ -4308,6 +4456,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
         };
+
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
             int32_t & id = std::get<1>(it);
@@ -4322,7 +4471,6 @@ static void llm_load_vocab(
             } else {
                 id = new_id;
             }
-
         }
 
         // Handle add_bos_token and add_eos_token
@@ -4336,6 +4484,28 @@ static void llm_load_vocab(
                 vocab.special_add_eos = int(temp);
             }
         }
+
+        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+        //       for now, we apply this workaround to find the EOT token based on its text
+        if (vocab.special_eot_id == -1) {
+            for (const auto & t : vocab.token_to_id) {
+                if (
+                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+                        //       need to fix convert script
+                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+                        (t.first == "<|eot_id|>" ||
+                         t.first == "<|im_end|>" ||
+                         t.first == "<|end|>" ||
+                         t.first == "<end_of_turn>"
+                        )
+                   ) {
+                    vocab.special_eot_id = t.second;
+                    break;
+                }
+            }
+        }
     }
 
     // build special tokens cache
@@ -4498,14 +4668,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
-    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
-    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5346,6 +5521,33 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_PHI3:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context* ctx_layer = ctx_for_layer(i);
+                        ggml_context* ctx_split = ctx_for_layer_split(i);
+
+                        auto& layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
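On the fused QKV projection created for PHI3 above: with grouped-query attention the K and V blocks contribute `n_embd_gqa = n_head_kv * head_dim` rows each, so the stacked output dimension is `n_embd + 2*n_embd_gqa`, which is exactly the second dimension of `wqkv`. A worked-numbers sketch (illustrative values, not read from a specific Phi-3 checkpoint):

```cpp
#include <cstdint>

// Rows contributed by each block of the fused QKV weight:
//   Q: n_embd, K: n_embd_gqa, V: n_embd_gqa
int64_t qkv_rows(int64_t n_embd, int64_t n_embd_gqa) {
    return n_embd + 2 * n_embd_gqa;
}
// e.g. n_embd = 3072 with n_embd_gqa = 3072 (no GQA) gives a {3072, 9216} tensor
```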
@@ -5880,7 +6082,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
@@ -6009,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
         const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         struct ggml_tensor * k_cur,
         struct ggml_tensor * v_cur,
-        int64_t n_ctx,
         int32_t n_tokens,
         int32_t kv_head,
         const llm_build_cb & cb,
         int64_t il) {
+    const int64_t n_ctx = cparams.n_ctx;
+
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     GGML_ASSERT(kv.size == n_ctx);
 
-    // compute the transposed [n_tokens, n_embd] V matrix
-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
-    cb(v_cur_t, "v_cur_t", il);
-
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
             (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
-            (  n_ctx)*ggml_element_size(kv.v_l[il]),
-            (kv_head)*ggml_element_size(kv.v_l[il]));
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    if (cparams.flash_attn) {
+        v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+                (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+                (  n_ctx)*ggml_element_size(kv.v_l[il]),
+                (kv_head)*ggml_element_size(kv.v_l[il]));
+
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
     cb(v_cache_view, "v_cache_view", il);
 
-    // important: storing RoPE-ed version of K in the KV cache!
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
 }
 
 static struct ggml_tensor * llm_build_norm(
|
|
6259
6471
|
return moe_out;
|
6260
6472
|
}
|
6261
6473
|
|
6262
|
-
// if max_alibi_bias > 0 then apply ALiBi
|
6263
6474
|
static struct ggml_tensor * llm_build_kqv(
|
6264
6475
|
struct ggml_context * ctx,
|
6265
6476
|
const llama_model & model,
|
6266
6477
|
const llama_hparams & hparams,
|
6478
|
+
const llama_cparams & cparams,
|
6267
6479
|
const llama_kv_cache & kv,
|
6268
6480
|
struct ggml_cgraph * graph,
|
6269
6481
|
struct ggml_tensor * wo,
|
@@ -6271,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6271
6483
|
struct ggml_tensor * q_cur,
|
6272
6484
|
struct ggml_tensor * kq_mask,
|
6273
6485
|
struct ggml_tensor * kq_pos,
|
6274
|
-
int64_t n_ctx,
|
6275
6486
|
int32_t n_tokens,
|
6276
6487
|
int32_t n_kv,
|
6277
6488
|
float kq_scale,
|
6278
6489
|
const llm_build_cb & cb,
|
6279
6490
|
int il) {
|
6491
|
+
const int64_t n_ctx = cparams.n_ctx;
|
6280
6492
|
const int64_t n_head = hparams.n_head;
|
6281
6493
|
const int64_t n_head_kv = hparams.n_head_kv;
|
6282
6494
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
@@ -6294,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
|
|
6294
6506
|
0);
|
6295
6507
|
cb(k, "k", il);
|
6296
6508
|
|
6297
|
-
struct ggml_tensor *
|
6298
|
-
cb(kq, "kq", il);
|
6509
|
+
struct ggml_tensor * cur;
|
6299
6510
|
|
6300
|
-
if (
|
6301
|
-
|
6302
|
-
|
6303
|
-
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6304
|
-
}
|
6511
|
+
if (cparams.flash_attn) {
|
6512
|
+
GGML_UNUSED(model);
|
6513
|
+
GGML_UNUSED(n_ctx);
|
6305
6514
|
|
6306
|
-
|
6307
|
-
//
|
6308
|
-
|
6309
|
-
// and then :
|
6310
|
-
// kq = 30 * tanh(kq / 30)
|
6311
|
-
// before the softmax below
|
6515
|
+
// note: if this assert triggers, then some check has failed earlier
|
6516
|
+
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
|
6517
|
+
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
|
6312
6518
|
|
6313
|
-
//
|
6314
|
-
|
6519
|
+
// split cached v into n_head heads (not transposed)
|
6520
|
+
struct ggml_tensor * v =
|
6521
|
+
ggml_view_3d(ctx, kv.v_l[il],
|
6522
|
+
n_embd_head_v, n_kv, n_head_kv,
|
6523
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
6524
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
6525
|
+
0);
|
6526
|
+
cb(v, "v", il);
|
6315
6527
|
|
6316
|
-
|
6317
|
-
|
6318
|
-
|
6528
|
+
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
|
6529
|
+
|
6530
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6531
|
+
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
6532
|
+
}
|
6533
|
+
|
6534
|
+
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
6535
|
+
} else {
|
6536
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
6537
|
+
cb(kq, "kq", il);
|
6538
|
+
|
6539
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
6540
|
+
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
6541
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
6542
|
+
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6543
|
+
}
|
6544
|
+
|
6545
|
+
if (model.arch == LLM_ARCH_GROK) {
|
6546
|
+
// need to do the following:
|
6547
|
+
// multiply by attn_output_multiplyer of 0.08838834764831845
|
6548
|
+
// and then :
|
6549
|
+
// kq = 30 * tanh(kq / 30)
|
6550
|
+
// before the softmax below
|
6551
|
+
|
6552
|
+
//try from phi2
|
6553
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
6554
|
+
|
6555
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
6556
|
+
kq = ggml_scale(ctx, kq, 30);
|
6557
|
+
}
|
6319
6558
|
|
6320
6559
|
#if defined(GGML_USE_KOMPUTE)
|
6321
6560
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
6322
6561
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
6323
6562
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
6324
|
-
|
6325
|
-
|
6326
|
-
|
6563
|
+
if (hparams.use_alibi) {
|
6564
|
+
kq = ggml_scale(ctx, kq, kq_scale);
|
6565
|
+
cb(kq, "kq_scaled", il);
|
6327
6566
|
|
6328
|
-
|
6329
|
-
|
6567
|
+
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
6568
|
+
cb(kq, "kq_scaled_alibi", il);
|
6330
6569
|
|
6331
|
-
|
6332
|
-
|
6570
|
+
kq = ggml_add(ctx, kq, kq_mask);
|
6571
|
+
cb(kq, "kq_masked", il);
|
6333
6572
|
|
6334
|
-
|
6335
|
-
|
6336
|
-
|
6573
|
+
kq = ggml_soft_max(ctx, kq);
|
6574
|
+
cb(kq, "kq_soft_max", il);
|
6575
|
+
} else
|
6337
6576
|
#endif
|
6338
|
-
|
6339
|
-
|
6340
|
-
|
6341
|
-
|
6577
|
+
{
|
6578
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
6579
|
+
cb(kq, "kq_soft_max_ext", il);
|
6580
|
+
}
|
6342
6581
|
|
6343
|
-
|
6582
|
+
GGML_ASSERT(kv.size == n_ctx);
|
6344
6583
|
|
6345
|
-
|
6346
|
-
|
6347
|
-
|
6348
|
-
|
6349
|
-
|
6350
|
-
|
6351
|
-
|
6352
|
-
|
6584
|
+
// split cached v into n_head heads
|
6585
|
+
struct ggml_tensor * v =
|
6586
|
+
ggml_view_3d(ctx, kv.v_l[il],
|
6587
|
+
n_kv, n_embd_head_v, n_head_kv,
|
6588
|
+
ggml_element_size(kv.v_l[il])*n_ctx,
|
6589
|
+
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
|
6590
|
+
0);
|
6591
|
+
cb(v, "v", il);
|
6353
6592
|
|
6354
|
-
|
6355
|
-
|
6593
|
+
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
6594
|
+
cb(kqv, "kqv", il);
|
6356
6595
|
|
6357
|
-
|
6358
|
-
|
6596
|
+
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
6597
|
+
cb(kqv_merged, "kqv_merged", il);
|
6359
6598
|
|
6360
|
-
|
6361
|
-
|
6599
|
+
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
6600
|
+
cb(cur, "kqv_merged_cont", il);
|
6601
|
+
}
|
6362
6602
|
|
6363
6603
|
ggml_build_forward_expand(graph, cur);
|
6364
6604
|
|
@@ -6378,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
6378
6618
|
struct ggml_context * ctx,
|
6379
6619
|
const llama_model & model,
|
6380
6620
|
const llama_hparams & hparams,
|
6621
|
+
const llama_cparams & cparams,
|
6381
6622
|
const llama_kv_cache & kv,
|
6382
6623
|
struct ggml_cgraph * graph,
|
6383
6624
|
struct ggml_tensor * wo,
|
@@ -6387,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
|
|
6387
6628
|
struct ggml_tensor * q_cur,
|
6388
6629
|
struct ggml_tensor * kq_mask,
|
6389
6630
|
struct ggml_tensor * kq_pos,
|
6390
|
-
int64_t n_ctx,
|
6391
6631
|
int32_t n_tokens,
|
6392
6632
|
int32_t kv_head,
|
6393
6633
|
int32_t n_kv,
|
@@ -6401,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
|
|
6401
6641
|
ggml_build_forward_expand(graph, k_cur);
|
6402
6642
|
ggml_build_forward_expand(graph, v_cur);
|
6403
6643
|
|
6404
|
-
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
|
6644
|
+
llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
|
6405
6645
|
|
6406
6646
|
struct ggml_tensor * cur;
|
6407
6647
|
|
6408
|
-
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
6409
|
-
q_cur, kq_mask, kq_pos,
|
6648
|
+
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
|
6649
|
+
q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
|
6410
6650
|
cb(cur, "kqv_out", il);
|
6411
6651
|
|
6412
6652
|
return cur;
|
@@ -6448,6 +6688,8 @@ struct llm_build_context {
|
|
6448
6688
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
6449
6689
|
const int32_t n_orig_ctx;
|
6450
6690
|
|
6691
|
+
const bool flash_attn;
|
6692
|
+
|
6451
6693
|
const enum llama_pooling_type pooling_type;
|
6452
6694
|
const enum llama_rope_type rope_type;
|
6453
6695
|
|
@@ -6494,6 +6736,7 @@ struct llm_build_context {
|
|
6494
6736
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
6495
6737
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
6496
6738
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
6739
|
+
flash_attn (cparams.flash_attn),
|
6497
6740
|
pooling_type (cparams.pooling_type),
|
6498
6741
|
rope_type (hparams.rope_type),
|
6499
6742
|
cb (cb),
|
@@ -6608,15 +6851,31 @@ struct llm_build_context {
|
|
6608
6851
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
6609
6852
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
6610
6853
|
|
6611
|
-
ggml_tensor * view_v_src
|
6612
|
-
|
6613
|
-
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
6614
|
-
ggml_row_size(kv_self.v_l[il]->type, i));
|
6854
|
+
ggml_tensor * view_v_src;
|
6855
|
+
ggml_tensor * view_v_dst;
|
6615
6856
|
|
6616
|
-
|
6617
|
-
|
6618
|
-
|
6619
|
-
|
6857
|
+
if (flash_attn) {
|
6858
|
+
// NOTE: the V cache is not transposed when using flash attention
|
6859
|
+
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6860
|
+
n_embd_v_gqa, nm,
|
6861
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
6862
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
|
6863
|
+
|
6864
|
+
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6865
|
+
n_embd_v_gqa, nm,
|
6866
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
6867
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
|
6868
|
+
} else {
|
6869
|
+
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6870
|
+
nm, n_embd_v_gqa,
|
6871
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
6872
|
+
ggml_row_size(kv_self.v_l[il]->type, i));
|
6873
|
+
|
6874
|
+
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
6875
|
+
nm, n_embd_v_gqa,
|
6876
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
6877
|
+
ggml_row_size(kv_self.v_l[il]->type, id));
|
6878
|
+
}
|
6620
6879
|
|
6621
6880
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
6622
6881
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
@@ -6646,20 +6905,26 @@ struct llm_build_context {
|
|
6646
6905
|
|
6647
6906
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
6648
6907
|
if (causal) {
|
6649
|
-
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
|
6908
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
6650
6909
|
} else {
|
6651
|
-
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
6910
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
6652
6911
|
}
|
6653
6912
|
cb(lctx.inp_KQ_mask, "KQ_mask", -1);
|
6654
6913
|
ggml_set_input(lctx.inp_KQ_mask);
|
6655
|
-
return lctx.inp_KQ_mask;
|
6914
|
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
|
6656
6915
|
}
|
6657
6916
|
|
6658
|
-
struct ggml_tensor * build_inp_KQ_pos() {
|
6659
|
-
|
6917
|
+
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
|
6918
|
+
if (causal) {
|
6919
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
|
6920
|
+
} else {
|
6921
|
+
// TODO: this will be needed for ALiBi-based BERT models
|
6922
|
+
// https://github.com/ggerganov/llama.cpp/pull/6826
|
6923
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
|
6924
|
+
}
|
6660
6925
|
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
|
6661
6926
|
ggml_set_input(lctx.inp_KQ_pos);
|
6662
|
-
return lctx.inp_KQ_pos;
|
6927
|
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
|
6663
6928
|
}
|
6664
6929
|
|
6665
6930
|
struct ggml_tensor * build_inp_mean() {
|
@@ -6765,9 +7030,9 @@ struct llm_build_context {
|
|
6765
7030
|
);
|
6766
7031
|
cb(Kcur, "Kcur", il);
|
6767
7032
|
|
6768
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7033
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
6769
7034
|
model.layers[il].wo, model.layers[il].bo,
|
6770
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7035
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6771
7036
|
}
|
6772
7037
|
|
6773
7038
|
if (il == n_layer - 1) {
|
@@ -6905,9 +7170,9 @@ struct llm_build_context {
|
|
6905
7170
|
cb(Qcur, "Qcur", il);
|
6906
7171
|
cb(Kcur, "Kcur", il);
|
6907
7172
|
|
6908
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7173
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
6909
7174
|
model.layers[il].wo, NULL,
|
6910
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
7175
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6911
7176
|
}
|
6912
7177
|
|
6913
7178
|
if (il == n_layer - 1) {
|
@@ -7012,9 +7277,9 @@ struct llm_build_context {
|
|
7012
7277
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7013
7278
|
);
|
7014
7279
|
cb(Kcur, "Kcur", il);
|
7015
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7280
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7016
7281
|
model.layers[il].wo, NULL,
|
7017
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
7282
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7018
7283
|
}
|
7019
7284
|
|
7020
7285
|
if (il == n_layer - 1) {
|
@@ -7132,9 +7397,9 @@ struct llm_build_context {
|
|
7132
7397
|
);
|
7133
7398
|
cb(Kcur, "Kcur", il);
|
7134
7399
|
|
7135
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7400
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7136
7401
|
model.layers[il].wo, NULL,
|
7137
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7402
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7138
7403
|
}
|
7139
7404
|
|
7140
7405
|
if (il == n_layer - 1) {
|
@@ -7257,9 +7522,9 @@ struct llm_build_context {
|
|
7257
7522
|
);
|
7258
7523
|
cb(Kcur, "Kcur", il);
|
7259
7524
|
|
7260
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7525
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7261
7526
|
model.layers[il].wo, model.layers[il].bo,
|
7262
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7527
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7263
7528
|
}
|
7264
7529
|
|
7265
7530
|
if (il == n_layer - 1) {
|
@@ -7409,9 +7674,9 @@ struct llm_build_context {
|
|
7409
7674
|
);
|
7410
7675
|
cb(Kcur, "Kcur", il);
|
7411
7676
|
|
7412
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7413
|
-
|
7414
|
-
|
7677
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7678
|
+
model.layers[il].wo, NULL,
|
7679
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7415
7680
|
}
|
7416
7681
|
|
7417
7682
|
if (il == n_layer - 1) {
|
@@ -7521,9 +7786,9 @@ struct llm_build_context {
|
|
7521
7786
|
|
7522
7787
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7523
7788
|
|
7524
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7789
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7525
7790
|
model.layers[il].wo, model.layers[il].bo,
|
7526
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
7791
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7527
7792
|
}
|
7528
7793
|
|
7529
7794
|
if (il == n_layer - 1) {
|
@@ -7725,9 +7990,9 @@ struct llm_build_context {
|
|
7725
7990
|
);
|
7726
7991
|
cb(Vcur, "Vcur", il);
|
7727
7992
|
|
7728
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7993
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7729
7994
|
model.layers[il].wo, model.layers[il].bo,
|
7730
|
-
Kcur, Vcur, Q, KQ_mask, nullptr,
|
7995
|
+
Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7731
7996
|
}
|
7732
7997
|
|
7733
7998
|
if (il == n_layer - 1) {
|
@@ -7821,9 +8086,9 @@ struct llm_build_context {
|
|
7821
8086
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
7822
8087
|
cb(Qcur, "Qcur", il);
|
7823
8088
|
|
7824
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8089
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
7825
8090
|
model.layers[il].wo, NULL,
|
7826
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8091
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7827
8092
|
}
|
7828
8093
|
|
7829
8094
|
if (il == n_layer - 1) {
|
@@ -8114,9 +8379,9 @@ struct llm_build_context {
|
|
8114
8379
|
|
8115
8380
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8116
8381
|
|
8117
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8382
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8118
8383
|
model.layers[il].wo, model.layers[il].bo,
|
8119
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8384
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8120
8385
|
}
|
8121
8386
|
|
8122
8387
|
if (il == n_layer - 1) {
|
@@ -8245,14 +8510,15 @@ struct llm_build_context {
|
|
8245
8510
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8246
8511
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8247
8512
|
|
8248
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8249
|
-
|
8250
|
-
|
8513
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8514
|
+
model.layers[il].wo, model.layers[il].bo,
|
8515
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8251
8516
|
} else {
|
8252
8517
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8253
|
-
|
8518
|
+
|
8519
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8254
8520
|
model.layers[il].wo, model.layers[il].bo,
|
8255
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
8521
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8256
8522
|
}
|
8257
8523
|
}
|
8258
8524
|
|
@@ -8394,9 +8660,9 @@ struct llm_build_context {
|
|
8394
8660
|
);
|
8395
8661
|
cb(Kcur, "Kcur", il);
|
8396
8662
|
|
8397
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8663
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8398
8664
|
model.layers[il].wo, NULL,
|
8399
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8665
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8400
8666
|
}
|
8401
8667
|
|
8402
8668
|
if (il == n_layer - 1) {
|
@@ -8512,9 +8778,9 @@ struct llm_build_context {
|
|
8512
8778
|
);
|
8513
8779
|
cb(Kcur, "Kcur", il);
|
8514
8780
|
|
8515
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8781
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8516
8782
|
model.layers[il].wo, NULL,
|
8517
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8783
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8518
8784
|
}
|
8519
8785
|
|
8520
8786
|
if (il == n_layer - 1) {
|
@@ -8625,9 +8891,9 @@ struct llm_build_context {
|
|
8625
8891
|
);
|
8626
8892
|
cb(Kcur, "Kcur", il);
|
8627
8893
|
|
8628
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8894
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8629
8895
|
model.layers[il].wo, model.layers[il].bo,
|
8630
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
8896
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8631
8897
|
}
|
8632
8898
|
|
8633
8899
|
if (il == n_layer - 1) {
|
@@ -8739,9 +9005,9 @@ struct llm_build_context {
|
|
8739
9005
|
);
|
8740
9006
|
cb(Kcur, "Kcur", il);
|
8741
9007
|
|
8742
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9008
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8743
9009
|
model.layers[il].wo, model.layers[il].bo,
|
8744
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9010
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8745
9011
|
}
|
8746
9012
|
|
8747
9013
|
if (il == n_layer - 1) {
|
@@ -8894,9 +9160,9 @@ struct llm_build_context {
|
|
8894
9160
|
);
|
8895
9161
|
cb(Kcur, "Kcur", il);
|
8896
9162
|
|
8897
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9163
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
8898
9164
|
model.layers[il].wo, model.layers[il].bo,
|
8899
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9165
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
8900
9166
|
}
|
8901
9167
|
|
8902
9168
|
if (il == n_layer - 1) {
|
@@ -8938,12 +9204,140 @@ struct llm_build_context {
|
|
8938
9204
|
|
8939
9205
|
cur = ggml_add(ctx0, cur, model.output_b);
|
8940
9206
|
cb(cur, "result_output", -1);
|
9207
|
+
ggml_build_forward_expand(gf, cur);
|
9208
|
+
return gf;
|
9209
|
+
}
|
9210
|
+
|
9211
|
+
struct ggml_cgraph * build_phi3() {
|
9212
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
9213
|
+
|
9214
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
9215
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
9216
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
9217
|
+
|
9218
|
+
struct ggml_tensor * cur;
|
9219
|
+
struct ggml_tensor * inpL;
|
9220
|
+
|
9221
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
9222
|
+
|
9223
|
+
// inp_pos - contains the positions
|
9224
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
9225
|
+
|
9226
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
9227
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
9228
|
+
|
9229
|
+
for (int il = 0; il < n_layer; ++il) {
|
9230
|
+
auto residual = inpL;
|
9231
|
+
|
9232
|
+
// self-attention
|
9233
|
+
{
|
9234
|
+
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
9235
|
+
model.layers[il].attn_norm,
|
9236
|
+
NULL,
|
9237
|
+
LLM_NORM_RMS, cb, il);
|
9238
|
+
cb(attn_norm_output, "attn_norm", il);
|
9239
|
+
|
9240
|
+
struct ggml_tensor * Qcur = nullptr;
|
9241
|
+
struct ggml_tensor * Kcur = nullptr;
|
9242
|
+
struct ggml_tensor * Vcur = nullptr;
|
9243
|
+
|
9244
|
+
if (model.layers[il].wqkv) {
|
9245
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
9246
|
+
cb(cur, "wqkv", il);
|
9247
|
+
|
9248
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
|
9249
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
|
9250
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
9251
|
+
}
|
9252
|
+
else {
|
9253
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
9254
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
9255
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
9256
|
+
}
|
9257
|
+
|
9258
|
+
cb(Qcur, "Qcur", il);
|
9259
|
+
cb(Kcur, "Kcur", il);
|
9260
|
+
cb(Vcur, "Vcur", il);
|
9261
|
+
|
9262
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9263
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9264
|
+
|
9265
|
+
Qcur = ggml_rope_custom(
|
9266
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9267
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9268
|
+
);
|
9269
|
+
cb(Qcur, "Qcur", il);
|
9270
|
+
|
9271
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
9272
|
+
cb(Qcur, "Qcur", il);
|
9273
|
+
|
9274
|
+
Kcur = ggml_rope_custom(
|
9275
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9276
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9277
|
+
);
|
9278
|
+
cb(Kcur, "Kcur", il);
|
9279
|
+
|
9280
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9281
|
+
model.layers[il].wo, model.layers[il].bo,
|
9282
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9283
|
+
}
|
9284
|
+
|
9285
|
+
if (il == n_layer - 1) {
|
9286
|
+
// skip computing output for unused tokens
|
9287
|
+
struct ggml_tensor* inp_out_ids = build_inp_out_ids();
|
9288
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9289
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
9290
|
+
}
|
9291
|
+
|
9292
|
+
cur = ggml_add(ctx0, cur, residual);
|
9293
|
+
residual = cur;
|
9294
|
+
|
9295
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
9296
|
+
model.layers[il].ffn_norm, NULL,
|
9297
|
+
LLM_NORM_RMS, cb, il);
|
9298
|
+
cb(cur, "ffn_norm", il);
|
9299
|
+
|
9300
|
+
// FF
|
9301
|
+
// special-case: the up and gate tensors are merged into a single tensor
|
9302
|
+
// TOOD: support into llm_build_ffn
|
9303
|
+
{
|
9304
|
+
struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
|
9305
|
+
cb(up, "ffn_up", il);
|
9306
|
+
|
9307
|
+
auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
|
9308
|
+
auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
|
9309
|
+
|
9310
|
+
y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
|
9311
|
+
cb(y, "ffn_gate", il);
|
9312
|
+
|
9313
|
+
auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
|
9314
|
+
cb(down, "ffn_down", il);
|
9315
|
+
|
9316
|
+
cur = down;
|
9317
|
+
cb(cur, "ffn_out", il);
|
9318
|
+
}
|
9319
|
+
|
9320
|
+
cur = ggml_add(ctx0, residual, cur);
|
9321
|
+
cb(cur, "l_out", il);
|
9322
|
+
|
9323
|
+
inpL = cur;
|
9324
|
+
}
|
9325
|
+
|
9326
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
9327
|
+
model.output_norm,
|
9328
|
+
NULL,
|
9329
|
+
LLM_NORM_RMS, cb, -1);
|
9330
|
+
cb(cur, "result_norm", -1);
|
9331
|
+
|
9332
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
9333
|
+
cb(cur, "result_output", -1);
|
8941
9334
|
|
8942
9335
|
ggml_build_forward_expand(gf, cur);
|
8943
9336
|
|
8944
9337
|
return gf;
|
8945
9338
|
}
|
8946
9339
|
|
9340
|
+
|
8947
9341
|
struct ggml_cgraph * build_plamo() {
|
8948
9342
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
8949
9343
|
|
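
Note on the merged FFN in build_phi3 above: Phi-3 ships its gate and up projections as a single ffn_up matrix, so each row of length 2*n_ff holds the gate half followed by the up half; the graph splits it with two ggml_view_2d calls and applies SiLU to the gate. A minimal stand-alone sketch of the same math on a plain buffer (illustrative only, not part of the diff; swiglu_merged is a hypothetical name):

    #include <cmath>
    #include <vector>

    // given one row [g_0..g_{n-1} | u_0..u_{n-1}] from the merged ffn_up
    // matmul, return u_i * SiLU(g_i) - the same computation the graph
    // builds with ggml_view_2d / ggml_silu / ggml_mul
    static std::vector<float> swiglu_merged(const std::vector<float> & row) {
        const size_t n_ff = row.size() / 2;
        std::vector<float> out(n_ff);
        for (size_t i = 0; i < n_ff; ++i) {
            const float g = row[i];        // gate half (first n_ff values)
            const float u = row[n_ff + i]; // up half (second n_ff values)
            out[i] = u * (g / (1.0f + std::exp(-g))); // SiLU(g) * u
        }
        return out;
    }
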
@@ -8996,9 +9390,9 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
             struct ggml_tensor * sa_out = cur;
 
@@ -9099,9 +9493,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9206,9 +9600,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9322,9 +9716,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9439,9 +9833,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9569,9 +9963,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9690,9 +10084,9 @@ struct llm_build_context {
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9809,9 +10203,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10099,9 +10493,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10230,9 +10624,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, nullptr,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10445,6 +10839,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PHI3:
+            {
+                result = llm.build_phi3();
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 result = llm.build_plamo();
@@ -10655,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }
 
-    if (hparams.need_kq_pos) {
+    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
+    // this allows to process multiple sequences in parallel with ALiBi-based models
+    if (hparams.use_alibi) {
        const int64_t n_kv = kv_self.n;
 
        GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11037,7 +11437,7 @@ static int llama_decode_internal(
        // a heuristic, to avoid attending the full cache if it is not yet utilized
        // after enough generations, the benefit from this heuristic disappears
        // if we start defragmenting the cache, the benefit from this will be more important
-        kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+        kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
        //kv_self.n = llama_kv_cache_cell_max(kv_self);
    }
 }
@@ -11205,6 +11605,10 @@ static int llama_decode_internal(
        }
    }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
    return 0;
 }
 
@@ -11230,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
    // each move requires 6*n_layer tensors (see build_defrag)
    //   - source view, destination view, copy operation
    //   - x2 for keys and values
-    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
 
    // determine which KV cells to move where
    //
@@ -11554,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
        }
        case LLAMA_VOCAB_TYPE_BPE: {
            GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
        }
        case LLAMA_VOCAB_TYPE_WPM: {
            GGML_ASSERT(false);
@@ -11776,7 +12182,79 @@ struct llm_tokenizer_bpe {
 
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        int final_prev_index = -1;
-        auto word_collection = bpe_gpt2_preprocess(text);
+
+        std::vector<std::string> word_collection;
+        switch (vocab.type) {
+            case LLAMA_VOCAB_TYPE_BPE:
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                            // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_MPT:
+                        // TODO: MPT pre-tokenization regexes are unknown
+                        //       the following are close, but not exact. run the following:
+                        //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                        GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                        word_collection = unicode_regex_split(text, {
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                        word_collection = unicode_regex_split(text, {
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    default:
+                        // default regex for BPE tokenization pre-processing
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                }
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
        symbols_final.clear();
 
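
The regex set selected above is keyed off the new "tokenizer.ggml.pre" GGUF field (LLM_KV_TOKENIZER_PRE in this release). A minimal sketch of reading that field with the gguf API from ggml.h (illustrative; get_pre_type is a hypothetical helper and error handling is elided):

    #include "ggml.h"

    // read "tokenizer.ggml.pre" from a GGUF file, falling back to "default"
    // when the key is absent (older conversions without the field)
    static const char * get_pre_type(const struct gguf_context * ctx) {
        const int kid = gguf_find_key(ctx, "tokenizer.ggml.pre");
        return kid < 0 ? "default" : gguf_get_val_str(ctx, kid);
    }
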
@@ -11903,145 +12381,6 @@ private:
            work_queue.push(bigram);
    }
 
-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        std::string token = "";
-        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
-
-        std::vector<std::string> text_utf;
-        text_utf.reserve(text.size());
-        bpe_words.reserve(text.size());
-        bpe_encoded_words.reserve(text.size());
-
-        const auto cpts = unicode_cpts_from_utf8(text);
-        for (size_t i = 0; i < cpts.size(); ++i)
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
-
-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
-            // forward backward lookups
-            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_words.emplace_back(token);
-                    token = "";
-                    i++;
-                    continue;
-                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next + utf_char_next_next;
-                    bpe_words.emplace_back(token); // the contraction
-                    token = "";
-                    i += 2;
-                    continue;
-                }
-            }
-
-            if (!split_condition && !collecting) {
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    split_condition = true;
-                }
-            }
-
-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
-            }
-
-            if (split_condition) {
-                if (token.size()) {
-                    bpe_words.emplace_back(token);
-                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
-            }
-            else {
-                token += utf_char;
-            }
-        }
-
-        for (std::string & word : bpe_words) {
-            std::string encoded_token = "";
-            for (char & c : word) {
-                encoded_token += unicode_byte_to_utf8(c);
-            }
-            bpe_encoded_words.emplace_back(encoded_token);
-        }
-
-        return bpe_encoded_words;
-    }
-
    const llama_vocab & vocab;
 
    std::vector<llm_symbol> symbols;
@@ -12361,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                    GGML_ASSERT(vocab.special_bos_id != -1);
                    output.push_back(vocab.special_bos_id);
                }
@@ -13268,16 +13607,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
    GGML_ASSERT(ctx);
    const int64_t t_start_sample_us = ggml_time_us();
 
-    bool allow_eos = false;
+    bool allow_eog = false;
    for (const auto & stack : grammar->stacks) {
        if (stack.empty()) {
-            allow_eos = true;
+            allow_eog = true;
            break;
        }
    }
 
-    const llama_token eos = llama_token_eos(&ctx->model);
-
    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
    candidates_decoded.reserve(candidates->size);
    std::vector<llama_grammar_candidate> candidates_grammar;
@@ -13285,9 +13622,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
-        if (id == eos) {
-            if (!allow_eos) {
+        const std::string piece = llama_token_to_piece(ctx, id, false);
+
+        if (llama_token_is_eog(&ctx->model, id)) {
+            if (!allow_eog) {
                candidates->data[i].logit = -INFINITY;
            }
        } else if (piece.empty() || piece[0] == 0) {
@@ -13450,7 +13788,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
    return result;
 }
 
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
    GGML_ASSERT(ctx);
 
    const int64_t t_start_sample_us = ggml_time_us();
@@ -13463,7 +13801,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
    }
 
    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
    int idx = dist(rng);
 
    llama_token result = candidates->data[idx].id;
@@ -13473,10 +13810,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
    return result;
 }
 
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
    const int64_t t_start_sample_us = ggml_time_us();
 
-    if (token == llama_token_eos(&ctx->model)) {
+    if (llama_token_is_eog(&ctx->model, token)) {
        for (const auto & stack : grammar->stacks) {
            if (stack.empty()) {
                return;
@@ -13485,7 +13826,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        GGML_ASSERT(false);
    }
 
-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);
 
    // Note terminating 0 in decoded string
    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
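
The sampling refactor above splits the RNG out of llama_sample_token so that callers can supply their own generator, while the old entry point keeps using the context-internal one. An illustrative use (assuming the llama_sample_token_with_rng declaration is visible to the caller; sample_with_seed is a hypothetical helper):

    #include <random>
    #include "llama.h"

    // sample with a caller-owned RNG for reproducible draws, independent of
    // the context-internal rng that plain llama_sample_token keeps using
    static llama_token sample_with_seed(llama_context * ctx,
                                        llama_token_data_array * cands,
                                        uint32_t seed) {
        std::mt19937 rng(seed);
        return llama_sample_token_with_rng(ctx, cands, rng);
    }
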
@@ -14131,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 }
 
 static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
-    std::mutex mutex;
-    int64_t counter = 0;
-    size_t new_size = 0;
    if (nthread < 2) {
        // single-thread
-        return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+        return new_size;
    }
-    auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
+
+    std::mutex mutex;
+    int64_t counter = 0;
+    size_t new_size = 0;
+    bool valid = true;
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
            nrows, n_per_row, imatrix]() {
        const int64_t nrows_per_chunk = chunk_size / n_per_row;
        size_t local_size = 0;
@@ -14153,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            local_size += this_size;
+
+            // validate the quantized data
+            const size_t row_size = ggml_row_size(new_type, n_per_row);
+            void * this_data = (char *) new_data + first_row * row_size;
+            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                valid = false;
+                break;
+            }
        }
    };
    for (int it = 0; it < nthread - 1; ++it) {
@@ -14162,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
    compute();
    for (auto & w : workers) { w.join(); }
    workers.clear();
+    if (!valid) {
+        throw std::runtime_error("quantized data validation failed");
+    }
    return new_size;
 }
 
@@ -14224,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }
-    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
    ml.init_mappings(false); // no prefetching
 
    llama_model model;
@@ -14262,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    for (auto & o : overrides) {
        if (o.key[0] == 0) break;
        if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-            gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            gguf_set_val_f32(ctx_out, o.key, o.val_f64);
        } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-            gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            gguf_set_val_i32(ctx_out, o.key, o.val_i64);
        } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-            gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+        } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+            gguf_set_val_str(ctx_out, o.key, o.val_str);
        } else {
            LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
        }
@@ -14308,26 +14670,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<no_init<uint8_t>> work;
    std::vector<no_init<float>> f32_conv_buf;
 
+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
    // populate the original tensors so we get an initial meta data
    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
    }
 
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
 
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
 
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
 
    const auto tn = LLM_TN(model.arch);
-
+    new_ofstream(0);
    for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }
 
        const std::string name = ggml_get_name(tensor);
 
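
The --keep-split path above names each shard with llama_split_path(). A small sketch of the naming it produces (the exact pattern shown in the comment is an expectation based on the gguf-split convention, e.g. "model-00001-of-00004.gguf"):

    #include <cstdio>
    #include "llama.h"

    int main() {
        char path[512];
        // for split 0 of 4 with prefix "model" this yields something like
        // "model-00001-of-00004.gguf"
        llama_split_path(path, sizeof(path), "model", /*split_no=*/0, /*split_count=*/4);
        printf("%s\n", path);
        return 0;
    }
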
@@ -14482,26 +14892,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        total_size_new += new_size;
 
        // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
    }
 
-    fout.close();
-
-    gguf_free(ctx_out);
-
    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
@@ -14545,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
    std::unique_ptr<llama_model_loader> ml;
    if (path_base_model) {
        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
        ml->init_mappings(/*prefetch*/ false); // no prefetching
    }
 
@@ -14804,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
+        /*.check_tensors               =*/ false,
    };
 
 #ifdef GGML_USE_METAL
@@ -14840,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
        /*.logits_all                  =*/ false,
        /*.embeddings                  =*/ false,
        /*.offload_kqv                 =*/ true,
+        /*.flash_attn                  =*/ false,
        /*.abort_callback              =*/ nullptr,
        /*.abort_callback_data         =*/ nullptr,
    };
@@ -14857,6 +15261,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.quantize_output_tensor      =*/ true,
        /*.only_copy                   =*/ false,
        /*.pure                        =*/ false,
+        /*.keep_split                  =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
    };
@@ -15005,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
    cparams.defrag_thold     = params.defrag_thold;
    cparams.embeddings       = params.embeddings;
    cparams.offload_kqv      = params.offload_kqv;
+    cparams.flash_attn       = params.flash_attn;
    cparams.pooling_type     = params.pooling_type;
 
    cparams.n_ctx            = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15012,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
    // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, 32);
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, 256);
 
    // with causal attention, the batch size is limited by the context size
    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
-    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
    cparams.n_yarn_orig_ctx  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
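
GGML_PAD rounds its first argument up to a multiple of the second, so the padding above turns e.g. a requested n_ctx of 4097 into 4352. A tiny check (the macro body is reproduced here under the assumption that it matches the definition in ggml.h):

    #include <cassert>

    // round x up to a multiple of n (mirrors GGML_PAD from ggml.h)
    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main() {
        assert(GGML_PAD(4097, 256) == 4352);
        assert(GGML_PAD(4096, 256) == 4096); // already-aligned values are unchanged
        return 0;
    }
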
@@ -15049,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
|
|
15049
15463
|
}
|
15050
15464
|
}
|
15051
15465
|
|
15466
|
+
if (cparams.flash_attn && hparams.use_alibi) {
|
15467
|
+
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
|
15468
|
+
cparams.flash_attn = false;
|
15469
|
+
}
|
15470
|
+
|
15471
|
+
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15472
|
+
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15473
|
+
cparams.flash_attn = false;
|
15474
|
+
}
|
15475
|
+
|
15476
|
+
#ifdef GGML_USE_HIPBLAS
|
15477
|
+
if (cparams.flash_attn) {
|
15478
|
+
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
|
15479
|
+
cparams.flash_attn = false;
|
15480
|
+
}
|
15481
|
+
#endif
|
15482
|
+
|
15052
15483
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
15053
15484
|
params.seed = time(NULL);
|
15054
15485
|
}
|
@@ -15056,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15056
15487
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
15057
15488
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
15058
15489
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
15490
|
+
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
15059
15491
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
15060
15492
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
15061
15493
|
|
@@ -15184,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15184
15616
|
}
|
15185
15617
|
ctx->backends.push_back(ctx->backend_cpu);
|
15186
15618
|
|
15187
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx
|
15619
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
15188
15620
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
15189
15621
|
llama_free(ctx);
|
15190
15622
|
return nullptr;
|
@@ -15365,6 +15797,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15365
15797
|
case LLM_ARCH_QWEN2:
|
15366
15798
|
case LLM_ARCH_QWEN2MOE:
|
15367
15799
|
case LLM_ARCH_PHI2:
|
15800
|
+
case LLM_ARCH_PHI3:
|
15368
15801
|
case LLM_ARCH_GEMMA:
|
15369
15802
|
case LLM_ARCH_STARCODER2:
|
15370
15803
|
return LLAMA_ROPE_TYPE_NEOX;
|
@@ -15378,6 +15811,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15378
15811
|
return LLAMA_ROPE_TYPE_NONE;
|
15379
15812
|
}
|
15380
15813
|
|
15814
|
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
|
15815
|
+
return ctx->cparams.pooling_type;
|
15816
|
+
}
|
15817
|
+
|
15381
15818
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
15382
15819
|
return model->hparams.n_vocab;
|
15383
15820
|
}
|
@@ -15778,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
15778
16215
|
const size_t s_kv_head = sizeof(uint32_t);
|
15779
16216
|
const size_t s_kv_size = sizeof(uint32_t);
|
15780
16217
|
const size_t s_kv_used = sizeof(uint32_t);
|
16218
|
+
const size_t s_v_trans = sizeof(uint32_t);
|
15781
16219
|
const size_t s_kv = ctx->kv_self.total_size();
|
15782
16220
|
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
15783
16221
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
@@ -15795,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
15795
16233
|
+ s_kv_head
|
15796
16234
|
+ s_kv_size
|
15797
16235
|
+ s_kv_used
|
16236
|
+
+ s_v_trans
|
15798
16237
|
+ s_kv
|
15799
16238
|
+ s_kv_cells
|
15800
16239
|
);
|
15801
16240
|
|
16241
|
+
// on session change it is very likely that the state size has changed - so we need to update this function
|
16242
|
+
static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
|
16243
|
+
|
15802
16244
|
return s_total;
|
15803
16245
|
}
|
15804
16246
|
|
@@ -15856,6 +16298,8 @@ struct llama_data_file_context : llama_data_context {
|
|
15856
16298
|
*
|
15857
16299
|
*/
|
15858
16300
|
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
16301
|
+
llama_synchronize(ctx);
|
16302
|
+
|
15859
16303
|
// copy rng
|
15860
16304
|
{
|
15861
16305
|
std::ostringstream rng_ss;
|
@@ -15942,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
15942
16386
|
const uint32_t kv_size = kv_self.size;
|
15943
16387
|
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
15944
16388
|
const uint32_t kv_used = kv_self.used;
|
16389
|
+
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
|
15945
16390
|
|
15946
16391
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
15947
16392
|
data_ctx->write(&kv_head, sizeof(kv_head));
|
15948
16393
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
15949
16394
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
16395
|
+
data_ctx->write(&v_trans, sizeof(v_trans));
|
15950
16396
|
|
15951
16397
|
if (kv_buf_size) {
|
15952
16398
|
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
@@ -15959,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
15959
16405
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
15960
16406
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
15961
16407
|
|
15962
|
-
if (kv_self.recurrent) {
|
16408
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
15963
16409
|
// v is contiguous for recurrent models
|
15964
16410
|
// TODO: use other tensors for state models than k and v
|
15965
16411
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16008,6 +16454,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
|
16008
16454
|
|
16009
16455
|
// Sets the state reading from the specified source address
|
16010
16456
|
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
16457
|
+
llama_synchronize(ctx);
|
16458
|
+
|
16011
16459
|
const uint8_t * inp = src;
|
16012
16460
|
|
16013
16461
|
// set rng
|
@@ -16090,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16090
16538
|
uint32_t kv_head;
|
16091
16539
|
uint32_t kv_size;
|
16092
16540
|
uint32_t kv_used;
|
16541
|
+
uint32_t v_trans;
|
16093
16542
|
|
16094
16543
|
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
16095
16544
|
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
16096
16545
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
16097
16546
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
16547
|
+
memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
|
16548
|
+
|
16549
|
+
GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
|
16098
16550
|
|
16099
16551
|
if (kv_self.size != kv_size) {
|
16100
16552
|
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
@@ -16104,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16104
16556
|
__func__, kv_head, kv_size, kv_self.size);
|
16105
16557
|
}
|
16106
16558
|
|
16559
|
+
llama_kv_cache_clear(ctx);
|
16560
|
+
|
16107
16561
|
if (kv_buf_size) {
|
16108
16562
|
const size_t pre_kv_buf_size = inp - src;
|
16109
16563
|
|
@@ -16115,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16115
16569
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
16116
16570
|
inp += k_size;
|
16117
16571
|
|
16118
|
-
if (kv_self.recurrent) {
|
16572
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
16119
16573
|
// v is contiguous for recurrent models
|
16120
16574
|
// TODO: use other tensors for state models than k and v
|
16121
16575
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16137,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16137
16591
|
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
16138
16592
|
}
|
16139
16593
|
|
16140
|
-
llama_kv_cache_clear(ctx);
|
16141
|
-
|
16142
16594
|
ctx->kv_self.head = kv_head;
|
16143
16595
|
ctx->kv_self.used = kv_used;
|
16144
16596
|
|
@@ -16312,6 +16764,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
|
|
16312
16764
|
}
|
16313
16765
|
|
16314
16766
|
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
16767
|
+
llama_synchronize(ctx);
|
16768
|
+
|
16315
16769
|
const auto & kv_self = ctx->kv_self;
|
16316
16770
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16317
16771
|
|
@@ -16396,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
|
16396
16850
|
}
|
16397
16851
|
}
|
16398
16852
|
|
16399
|
-
//
|
16400
|
-
|
16401
|
-
|
16402
|
-
|
16403
|
-
|
16404
|
-
|
16853
|
+
// TODO: simplify, reduce copy-paste
|
16854
|
+
if (!kv_self.v_trans) {
|
16855
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16856
|
+
// Write value type
|
16857
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16858
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16405
16859
|
|
16406
|
-
|
16407
|
-
|
16408
|
-
|
16860
|
+
// Write row size of value
|
16861
|
+
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
16862
|
+
data_ctx.write(&v_size_row, sizeof(v_size_row));
|
16409
16863
|
|
16410
|
-
|
16411
|
-
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16412
|
-
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16864
|
+
// Read each range of cells of v_size length each into tmp_buf and write out
|
16413
16865
|
for (const auto & range : cell_ranges) {
|
16414
16866
|
const size_t range_size = range.second - range.first;
|
16415
|
-
|
16416
|
-
tmp_buf.
|
16417
|
-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16867
|
+
tmp_buf.resize(range_size * v_size_row);
|
16868
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
16418
16869
|
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16419
16870
|
}
|
16420
16871
|
}
|
16872
|
+
} else {
|
16873
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16874
|
+
const uint32_t kv_size = kv_self.size;
|
16875
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16876
|
+
// Write value type
|
16877
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16878
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16879
|
+
|
16880
|
+
// Write element size
|
16881
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16882
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16883
|
+
|
16884
|
+
// For each row, we get the element values of each cell
|
16885
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16886
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16887
|
+
for (const auto & range : cell_ranges) {
|
16888
|
+
const size_t range_size = range.second - range.first;
|
16889
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16890
|
+
tmp_buf.resize(range_size * v_size_el);
|
16891
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16892
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16893
|
+
}
|
16894
|
+
}
|
16895
|
+
}
|
16421
16896
|
}
|
16422
16897
|
|
16423
16898
|
return data_ctx.get_size_written();
|
@@ -16429,6 +16904,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
|
|
16429
16904
|
}
|
16430
16905
|
|
16431
16906
|
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
16907
|
+
llama_synchronize(ctx);
|
16908
|
+
|
16432
16909
|
auto & kv_self = ctx->kv_self;
|
16433
16910
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16434
16911
|
|
@@ -16540,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
|
|
16540
17017
|
}
|
16541
17018
|
}
|
16542
17019
|
|
16543
|
-
//
|
16544
|
-
|
16545
|
-
|
16546
|
-
|
16547
|
-
|
16548
|
-
|
16549
|
-
|
16550
|
-
|
16551
|
-
|
16552
|
-
|
16553
|
-
|
16554
|
-
|
17020
|
+
// TODO: simplify, reduce copy-paste
|
17021
|
+
if (!kv_self.v_trans) {
|
17022
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
17023
|
+
// Read type of value
|
17024
|
+
int32_t v_type_i_ref;
|
17025
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
17026
|
+
inp += sizeof(v_type_i_ref);
|
17027
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
17028
|
+
if (v_type_i != v_type_i_ref) {
|
17029
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
17030
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
17031
|
+
return 0;
|
17032
|
+
}
|
16555
17033
|
|
16556
|
-
|
16557
|
-
|
16558
|
-
|
16559
|
-
|
16560
|
-
|
16561
|
-
|
16562
|
-
|
16563
|
-
|
16564
|
-
|
16565
|
-
|
+            // Read row size of value
+            size_t v_size_row_ref;
+            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+            inp += sizeof(v_size_row_ref);
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+                return 0;
+            }
 
-        if (cell_count) {
-            // For each row in the transposed matrix, read the values for the whole cell range
-            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
-                ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
-                inp += cell_count * v_size_el;
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+                inp += cell_count * v_size_row;
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
+
+            // Read element size of value
+            size_t v_size_el_ref;
+            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+            inp += sizeof(v_size_el_ref);
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+                return 0;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                    inp += cell_count * v_size_el;
+                }
             }
         }
     }
 
     const size_t nread = inp - src;
+
     return nread;
 }
 
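Note: the llama_synchronize() added above makes llama_state_seq_set_data flush any queued backend work before the KV cache is overwritten. A minimal sketch of the intended round trip, assuming ctx_src and ctx_dst are valid contexts created from the same model:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Copy the KV state of sequence 0 in ctx_src into sequence 1 of ctx_dst.
    void copy_seq_state(llama_context * ctx_src, llama_context * ctx_dst) {
        std::vector<uint8_t> buf(llama_state_seq_get_size(ctx_src, 0));
        llama_state_seq_get_data(ctx_src, buf.data(), 0);
        llama_state_seq_set_data(ctx_dst, buf.data(), 1); // synchronizes internally now
    }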
@@ -16880,6 +17391,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
     return model->vocab.id_to_token[token].type;
 }
 
+bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+    return token != -1 && (
+        token == llama_token_eos(model) ||
+        token == llama_token_eot(model)
+    );
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
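Note: llama_token_is_eog() above folds the usual "is this EOS or EOT?" check into one call, which matters for Llama 3-style models whose turns end with <|eot_id|> rather than EOS. A sketch of the stop check in a generation loop; sample_next is a hypothetical placeholder for the caller's sampler, not a llama.cpp function:

    #include "llama.h"

    extern llama_token sample_next(llama_context * ctx); // hypothetical sampler

    void generate(llama_context * ctx, const llama_model * model, int n_max) {
        for (int i = 0; i < n_max; ++i) {
            const llama_token tok = sample_next(ctx);
            // One call now covers EOS as well as end-of-turn tokens such as <|eot_id|>.
            if (llama_token_is_eog(model, tok)) {
                break;
            }
            // ... feed tok back via llama_decode() and continue ...
        }
    }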
@@ -16957,7 +17475,7 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17490,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                (llama_is_user_defined_token(model->vocab, token)) ||
+                (llama_is_control_token     (model->vocab, token) && special)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 if (length < (int) result.length()) {
                     return -(int) result.length();
@@ -16985,8 +17505,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, "\xe2\x96\x85", 3);
                 return 3;
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
             } else if (llama_is_byte_token(model->vocab, token)) {
                 if (length < 1) {
                     return -1;
@@ -17007,15 +17525,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                (llama_is_user_defined_token(model->vocab, token)) ||
+                (llama_is_control_token     (model->vocab, token) && special)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 if (length < (int) result.length()) {
                     return -(int) result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
             }
             break;
         }
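Note: the new special flag keeps the old behaviour when false (control tokens render as empty strings) and returns the token's literal text when true, which chat frontends need for tokens like <|eot_id|>. A sketch of the usual wrapper, relying on the convention visible above that a negative return value is the required buffer size:

    #include <string>
    #include <vector>
    #include "llama.h"

    std::string token_to_piece(const llama_model * model, llama_token tok, bool special) {
        std::vector<char> buf(8);
        int32_t n = llama_token_to_piece(model, tok, buf.data(), (int32_t) buf.size(), special);
        if (n < 0) { // buffer too small: -n is the size needed
            buf.resize(-n);
            n = llama_token_to_piece(model, tok, buf.data(), (int32_t) buf.size(), special);
        }
        return std::string(buf.data(), n);
    }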
@@ -17213,6 +17731,24 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
+    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+        // Llama 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+        }
+        if (add_ass) {
+            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+        }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // template not supported
         return -1;
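Note: with the two branches above, llama_chat_apply_template() recognizes the Llama 3 and Phi 3 formats, either detected from the model's embedded template or forced by passing "llama3" / "phi3" as tmpl. A minimal sketch; the messages are example data and error handling is elided:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        const llama_chat_message msgs[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        std::vector<char> buf(1024);
        // Passing an explicit template name selects the new Llama 3 branch directly.
        const int32_t n = llama_chat_apply_template(nullptr, "llama3", msgs, 2, true,
                                                    buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            std::printf("%.*s\n", n, buf.data());
        }
        return 0;
    }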
@@ -17345,6 +17881,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LLAMAFILE = 1 | ";
+#else
+    s += "LLAMAFILE = 0 | ";
+#endif
 
     return s.c_str();
 }