llama_cpp 0.14.6 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +90 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +22 -3
- data/vendor/tmp/llama.cpp/Makefile +52 -22
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
- data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
- data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml.c +394 -44
- data/vendor/tmp/llama.cpp/ggml.h +22 -0
- data/vendor/tmp/llama.cpp/llama.cpp +996 -455
- data/vendor/tmp/llama.cpp/llama.h +46 -15
- data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
- data/vendor/tmp/llama.cpp/sgemm.h +4 -2
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
- data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +3 -3
@@ -75,6 +75,7 @@
 #include <forward_list>
 #include <fstream>
 #include <functional>
+#include <future>
 #include <initializer_list>
 #include <locale>
 #include <map>
@@ -107,7 +108,6 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 60
 
-
 //
 // logging
 //
@@ -211,6 +211,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -314,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -390,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -793,6 +797,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1600,12 +1621,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
@@ -1824,7 +1845,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool
+    bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1914,6 +1935,7 @@ struct llama_cparams {
     bool embeddings;
     bool causal_attn;
     bool offload_kqv;
+    bool flash_attn;
 
     enum llama_pooling_type pooling_type;
 
@@ -2017,8 +2039,8 @@ struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
    bool do_copy = false;
-    // with recurrent state models, a cell can hold the state for more than one past token
-    bool
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    bool v_trans = true; // the value tensor is transposed
 
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2095,7 +2117,8 @@ struct llama_vocab {
         ttype type;
     };
 
-    enum llama_vocab_type
+    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;
@@ -2120,7 +2143,7 @@ struct llama_vocab {
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
-    id special_eot_id = -1;
+    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
 
     bool add_space_prefix = true;
 
@@ -2316,11 +2339,14 @@ struct llama_context {
 
 static bool llama_kv_cache_init(
         struct llama_kv_cache & cache,
-
+        const llama_context * ctx,
         ggml_type type_k,
         ggml_type type_v,
         uint32_t kv_size,
         bool offload) {
+    const llama_model & model = ctx->model;
+    const llama_cparams & cparams = ctx->cparams;
+
     const struct llama_hparams & hparams = model.hparams;
 
     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2331,8 +2357,9 @@ static bool llama_kv_cache_init(
 
     // TODO: find a nicer way to add other recurrent model architectures
     cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+    cache.v_trans = !cparams.flash_attn;
 
-    // TODO: support mixed
+    // TODO: support mixed recurrent Transformer architectures
     // NOTE: (!a || b) is a logical implication (a -> b)
     GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
     GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2543,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     }
     cache.head = 0;
     cache.used = 0;
+
+    for (auto & buf : cache.bufs) {
+        ggml_backend_buffer_clear(buf, 0);
+    }
 }
 
 static bool llama_kv_cache_seq_rm(
@@ -2863,6 +2894,7 @@ namespace GGUFMeta {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
             case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
         }
         return "unknown";
     }
@@ -2874,13 +2906,16 @@ namespace GGUFMeta {
             __func__, override_type_to_str(ovrd->tag), ovrd->key);
         switch (ovrd->tag) {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
-                LLAMA_LOG_INFO("%s\n", ovrd->
+                LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
             } break;
             case LLAMA_KV_OVERRIDE_TYPE_INT: {
-                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->
+                LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
             } break;
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                LLAMA_LOG_INFO("%.6f\n", ovrd->
+                LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+            } break;
+            case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                LLAMA_LOG_INFO("%s\n", ovrd->val_str);
             } break;
             default:
                 // Shouldn't be possible to end up here, but just in case...
@@ -2899,7 +2934,7 @@ namespace GGUFMeta {
     static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_bool;
             return true;
         }
         return false;
@@ -2909,7 +2944,7 @@ namespace GGUFMeta {
     static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
     try_override(OT & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_i64;
             return true;
         }
         return false;
@@ -2919,7 +2954,7 @@ namespace GGUFMeta {
     static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override * ovrd) {
         if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-            target = ovrd->
+            target = ovrd->val_f64;
             return true;
         }
         return false;
@@ -2928,12 +2963,11 @@ namespace GGUFMeta {
     template<typename OT>
     static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
     try_override(T & target, const struct llama_model_kv_override * ovrd) {
-        (
-
-
-
-
-        ovrd ? ovrd->key : "NULL"));
+        if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+            target = ovrd->val_str;
+            return true;
+        }
+        return false;
     }
 
     static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
@@ -2966,6 +3000,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool check_tensors;
 
     llama_files files;
     llama_ftype ftype;
@@ -2980,9 +3015,13 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+            }
         }
     };
     std::vector<llama_tensor_weight> weights;
@@ -2995,7 +3034,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
            trace = atoi(getenv("LLAMA_TRACE"));
@@ -3021,15 +3060,15 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(0, cur->name, meta, cur);
+            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-        contexts.emplace_back(ctx);
-
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
 
@@ -3063,12 +3102,13 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
             }
 
+            files.emplace_back(new llama_file(split_path, "rb"));
+            contexts.emplace_back(ctx);
+
             // Save tensors data offset info of the shard.
             for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
             }
-            files.emplace_back(new llama_file(split_path, "rb"));
-            contexts.emplace_back(ctx);
 
             gguf_free(ctx_gguf);
         }
@@ -3091,9 +3131,17 @@ struct llama_model_loader {
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
+        std::set<std::string> tensor_names;
         for (auto & w : weights) {
             n_elements += ggml_nelements(w.tensor);
             n_bytes += ggml_nbytes(w.tensor);
+            // make sure there is no duplicated tensor names
+            const std::string name(w.tensor->name);
+            auto found = tensor_names.find(name);
+            if (found != tensor_names.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+            }
+            tensor_names.insert(name);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3199,6 +3247,7 @@ struct llama_model_loader {
         }
 
         this->use_mmap = use_mmap;
+        this->check_tensors = check_tensors;
     }
 
     ~llama_model_loader() {
@@ -3278,6 +3327,10 @@ struct llama_model_loader {
         return nullptr;
     }
 
+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -3453,6 +3506,10 @@ struct llama_model_loader {
                 file->seek(w.offs, SEEK_SET);
                 file->read_raw(cur->data, ggml_nbytes(cur));
             }
+
+            if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+            }
         }
 
         size_t size_done = 0;
@@ -3469,6 +3526,8 @@ struct llama_model_loader {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
+        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3490,37 +3549,66 @@ struct llama_model_loader {
                 if (bufs_mmap.count(weight->idx)) {
                     buf_mmap = bufs_mmap.at(weight->idx);
                 }
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur,
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
                     if (lmlocks) {
                         const auto & lmlock = lmlocks->at(weight->idx);
-                        lmlock->grow_to(weight->offs +
+                        lmlock->grow_to(weight->offs + n_size);
                     }
 
                     auto & mmap_used = mmaps_used[weight->idx];
                     mmap_used.first = std::min(mmap_used.first, weight->offs);
                     mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur,
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data,
+                    file->read_raw(cur->data, n_size);
+                    if (check_tensors) {
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                        }));
+                    }
                 } else {
-                    read_buf.resize(
+                    read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(),
+                    file->read_raw(read_buf.data(), n_size);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
 
             size_done += n_size;
         }
 
+        // check validation results
+        bool validation_failed = false;
+        for (auto & future : validation_result) {
+            auto result = future.get();
+            if (!result.second) {
+                LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+                validation_failed = true;
+            }
+        }
+        if (validation_failed) {
+            throw std::runtime_error("found tensors with invalid data");
+        }
+
        // check if this is the last call and do final cleanup
        if (size_done >= size_data) {
            // unmap offloaded tensors and metadata
@@ -3770,7 +3858,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3955,6 +4043,16 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
@@ -4104,7 +4202,7 @@ static void llm_load_hparams(
     model.ftype = ml.ftype;
 
     if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.
+        hparams.use_alibi = true;
     }
 
     hparams.rope_type = llama_rope_type(&model);
@@ -4127,11 +4225,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL,
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-        if (
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -4145,7 +4245,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;
 
             return;
-        } else if (
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -4179,7 +4279,10 @@ static void llm_load_vocab(
                 vocab.special_prefix_id = 67;
                 vocab.special_suffix_id = 69;
                 vocab.special_middle_id = 68;
-
+                // TODO: this is not EOT, it is "file separator" token, needs fix
+                // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+                //vocab.special_eot_id = 70;
+                vocab.special_eot_id = 107;
             }
         }
 
@@ -4187,9 +4290,27 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (
-            vocab.type =
+        } else if (tokenizer_model == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = 102;
+            vocab.special_pad_id = 0;
+            vocab.special_cls_id = 101;
+            vocab.special_mask_id = 103;
+            vocab.add_space_prefix = false;
+        } else {
+            if (tokenizer_model == "gpt2") {
+                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+            } else {
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+                vocab.type = LLAMA_VOCAB_TYPE_SPM;
+                return;
+            }
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4223,23 +4344,50 @@ static void llm_load_vocab(
             vocab.special_pad_id = -1;
             vocab.special_cls_id = -1;
             vocab.special_mask_id = -1;
-        }
-        vocab.type = LLAMA_VOCAB_TYPE_WPM;
+        }
 
-
-
-
-
-
-
-
-
-
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3" ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
         } else {
-
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
     }
 
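The new `tokenizer.ggml.pre` metadata key selects one of the BPE pre-tokenizer behaviours dispatched above; GGUF files converted before this change fall back to the "default" pre-tokenizer with a loud warning. A hedged sketch of inspecting the key in a GGUF file with the gguf API (the file name is illustrative):

```cpp
// Sketch: read tokenizer.ggml.model / tokenizer.ggml.pre from a GGUF file.
#include <cstdio>
#include "ggml.h"

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) {
        return 1;
    }

    const int model_key = gguf_find_key(ctx, "tokenizer.ggml.model");
    const int pre_key   = gguf_find_key(ctx, "tokenizer.ggml.pre");

    // gguf_find_key returns -1 when the key is missing
    printf("tokenizer.ggml.model = %s\n", model_key >= 0 ? gguf_get_val_str(ctx, model_key) : "(missing)");
    printf("tokenizer.ggml.pre   = %s\n", pre_key   >= 0 ? gguf_get_val_str(ctx, pre_key)   : "(missing)");

    gguf_free(ctx);
    return 0;
}
```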
@@ -4308,6 +4456,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
         };
+
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
             int32_t & id = std::get<1>(it);
@@ -4322,7 +4471,6 @@ static void llm_load_vocab(
             } else {
                 id = new_id;
             }
-
         }
 
         // Handle add_bos_token and add_eos_token
@@ -4336,6 +4484,28 @@ static void llm_load_vocab(
                 vocab.special_add_eos = int(temp);
             }
         }
+
+        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+        // for now, we apply this workaround to find the EOT token based on its text
+        if (vocab.special_eot_id == -1) {
+            for (const auto & t : vocab.token_to_id) {
+                if (
+                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+                        // need to fix convert script
+                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+                        (t.first == "<|eot_id|>" ||
+                         t.first == "<|im_end|>" ||
+                         t.first == "<|end|>" ||
+                         t.first == "<end_of_turn>"
+                        )
+                   ) {
+                    vocab.special_eot_id = t.second;
+                    break;
+                }
+            }
+        }
     }
 
     // build special tokens cache
@@ -4498,14 +4668,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id
-    if (vocab.special_eos_id
-    if (vocab.special_unk_id
-    if (vocab.special_sep_id
-    if (vocab.special_pad_id
-    if (vocab.special_cls_id
-    if (vocab.special_mask_id
-
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5346,6 +5521,33 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_PHI3:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context* ctx_layer = ctx_for_layer(i);
+                        ggml_context* ctx_split = ctx_for_layer_split(i);
+
+                        auto& layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5880,7 +6082,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
@@ -6009,37 +6211,47 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
         const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         struct ggml_tensor * k_cur,
         struct ggml_tensor * v_cur,
-        int64_t n_ctx,
         int32_t n_tokens,
         int32_t kv_head,
         const llm_build_cb & cb,
         int64_t il) {
+    const int64_t n_ctx = cparams.n_ctx;
+
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     GGML_ASSERT(kv.size == n_ctx);
 
-    // compute the transposed [n_tokens, n_embd] V matrix
-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
-    cb(v_cur_t, "v_cur_t", il);
-
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
             (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-
-
-
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    if (cparams.flash_attn) {
+        v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+                (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+    } else {
+        // note: the V cache is transposed when not using flash attention
+        v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+                ( n_ctx)*ggml_element_size(kv.v_l[il]),
+                (kv_head)*ggml_element_size(kv.v_l[il]));
+
+        v_cur = ggml_transpose(ctx, v_cur);
+    }
     cb(v_cache_view, "v_cache_view", il);
 
-
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
 }
 
 static struct ggml_tensor * llm_build_norm(
@@ -6259,11 +6471,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
     return moe_out;
 }
 
-// if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
         const llama_model & model,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
         const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         struct ggml_tensor * wo,
@@ -6271,12 +6483,12 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
         struct ggml_tensor * kq_pos,
-        int64_t n_ctx,
         int32_t n_tokens,
         int32_t n_kv,
         float kq_scale,
         const llm_build_cb & cb,
         int il) {
+    const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6294,71 +6506,99 @@ static struct ggml_tensor * llm_build_kqv(
             0);
     cb(k, "k", il);
 
-    struct ggml_tensor *
-    cb(kq, "kq", il);
+    struct ggml_tensor * cur;
 
-    if (
-
-
-        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-    }
+    if (cparams.flash_attn) {
+        GGML_UNUSED(model);
+        GGML_UNUSED(n_ctx);
 
-
-    //
-
-    // and then :
-    // kq = 30 * tanh(kq / 30)
-    // before the softmax below
+        // note: if this assert triggers, then some check has failed earlier
+        // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
+        GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
 
-    //
-
+        // split cached v into n_head heads (not transposed)
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_embd_head_v, n_kv, n_head_kv,
+                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    0);
+        cb(v, "v", il);
 
-
-
-
+        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+        }
+
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+    } else {
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+        cb(kq, "kq", il);
+
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+        }
+
+        if (model.arch == LLM_ARCH_GROK) {
+            // need to do the following:
+            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // and then :
+            // kq = 30 * tanh(kq / 30)
+            // before the softmax below
+
+            //try from phi2
+            //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+            kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+            kq = ggml_scale(ctx, kq, 30);
+        }
 
 #if defined(GGML_USE_KOMPUTE)
 #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-
-
-
+        if (hparams.use_alibi) {
+            kq = ggml_scale(ctx, kq, kq_scale);
+            cb(kq, "kq_scaled", il);
 
-
-
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
 
-
-
+            kq = ggml_add(ctx, kq, kq_mask);
+            cb(kq, "kq_masked", il);
 
-
-
-
+            kq = ggml_soft_max(ctx, kq);
+            cb(kq, "kq_soft_max", il);
+        } else
 #endif
-
-
-
-
+        {
+            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+        }
 
-
+        GGML_ASSERT(kv.size == n_ctx);
 
-
-
-
-
-
-
-
-
+        // split cached v into n_head heads
+        struct ggml_tensor * v =
+            ggml_view_3d(ctx, kv.v_l[il],
+                    n_kv, n_embd_head_v, n_head_kv,
+                    ggml_element_size(kv.v_l[il])*n_ctx,
+                    ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+                    0);
+        cb(v, "v", il);
 
-
-
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+        cb(kqv, "kqv", il);
 
-
-
+        struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+        cb(kqv_merged, "kqv_merged", il);
 
-
-
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cb(cur, "kqv_merged_cont", il);
+    }
 
     ggml_build_forward_expand(graph, cur);
 
@@ -6378,6 +6618,7 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_context * ctx,
         const llama_model & model,
         const llama_hparams & hparams,
+        const llama_cparams & cparams,
         const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         struct ggml_tensor * wo,
@@ -6387,7 +6628,6 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
         struct ggml_tensor * kq_pos,
-        int64_t n_ctx,
         int32_t n_tokens,
         int32_t kv_head,
         int32_t n_kv,
@@ -6401,12 +6641,12 @@ static struct ggml_tensor * llm_build_kv(
     ggml_build_forward_expand(graph, k_cur);
     ggml_build_forward_expand(graph, v_cur);
 
-    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
+    llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
 
     struct ggml_tensor * cur;
 
-    cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
-            q_cur, kq_mask, kq_pos,
+    cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
 
     return cur;
@@ -6448,6 +6688,8 @@ struct llm_build_context {
     const int32_t kv_head; // index of where we store new KV data in the cache
     const int32_t n_orig_ctx;
 
+    const bool flash_attn;
+
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type rope_type;
 
@@ -6494,6 +6736,7 @@ struct llm_build_context {
         n_outputs (worst_case ? n_tokens : lctx.n_outputs),
         kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
+        flash_attn (cparams.flash_attn),
         pooling_type (cparams.pooling_type),
         rope_type (hparams.rope_type),
         cb (cb),
@@ -6608,15 +6851,31 @@ struct llm_build_context {
                     ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                     ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
 
-            ggml_tensor * view_v_src
-
-                    ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                    ggml_row_size(kv_self.v_l[il]->type, i));
+            ggml_tensor * view_v_src;
+            ggml_tensor * view_v_dst;
 
-
-
-
-
+            if (flash_attn) {
+                // NOTE: the V cache is not transposed when using flash attention
+                view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        n_embd_v_gqa, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+
+                view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        n_embd_v_gqa, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+            } else {
+                view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, i));
+
+                view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, id));
+            }
 
             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6646,20 +6905,26 @@ struct llm_build_context {
 
     struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
         if (causal) {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         } else {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         }
         cb(lctx.inp_KQ_mask, "KQ_mask", -1);
         ggml_set_input(lctx.inp_KQ_mask);
-        return lctx.inp_KQ_mask;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }
 
-    struct ggml_tensor * build_inp_KQ_pos() {
-
+    struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
+        if (causal) {
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+        } else {
+            // TODO: this will be needed for ALiBi-based BERT models
+            // https://github.com/ggerganov/llama.cpp/pull/6826
+            lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
+        }
         cb(lctx.inp_KQ_pos, "KQ_pos", -1);
         ggml_set_input(lctx.inp_KQ_pos);
-        return lctx.inp_KQ_pos;
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
     }
 
     struct ggml_tensor * build_inp_mean() {
@@ -6765,9 +7030,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -6905,9 +7170,9 @@ struct llm_build_context {
         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, NULL,
-                Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7012,9 +7277,9 @@ struct llm_build_context {
                 ext_factor, attn_factor, beta_fast, beta_slow
         );
         cb(Kcur, "Kcur", il);
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, NULL,
-                Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7132,9 +7397,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, NULL,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7257,9 +7522,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7409,9 +7674,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                model.layers[il].wo, NULL,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7521,9 +7786,9 @@ struct llm_build_context {
 
         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7725,9 +7990,9 @@ struct llm_build_context {
         );
         cb(Vcur, "Vcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Q, KQ_mask, nullptr,
+                Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -7821,9 +8086,9 @@ struct llm_build_context {
         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
         cb(Qcur, "Qcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, NULL,
-                Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8114,9 +8379,9 @@ struct llm_build_context {
 
         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8245,14 +8510,15 @@ struct llm_build_context {
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
         } else {
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
+
+            cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                    Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
         }
     }
 
@@ -8394,9 +8660,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, NULL,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8512,9 +8778,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, NULL,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8625,9 +8891,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8739,9 +9005,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8894,9 +9160,9 @@ struct llm_build_context {
         );
         cb(Kcur, "Kcur", il);
 
-        cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+        cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                 model.layers[il].wo, model.layers[il].bo,
-                Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
     }
 
     if (il == n_layer - 1) {
@@ -8938,12 +9204,140 @@ struct llm_build_context {
|
|
8938
9204
|
|
8939
9205
|
cur = ggml_add(ctx0, cur, model.output_b);
|
8940
9206
|
cb(cur, "result_output", -1);
|
9207
|
+
ggml_build_forward_expand(gf, cur);
|
9208
|
+
return gf;
|
9209
|
+
}
|
9210
|
+
|
9211
|
+
struct ggml_cgraph * build_phi3() {
|
9212
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
9213
|
+
|
9214
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
9215
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
9216
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
9217
|
+
|
9218
|
+
struct ggml_tensor * cur;
|
9219
|
+
struct ggml_tensor * inpL;
|
9220
|
+
|
9221
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
9222
|
+
|
9223
|
+
// inp_pos - contains the positions
|
9224
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
9225
|
+
|
9226
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
9227
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
9228
|
+
|
9229
|
+
for (int il = 0; il < n_layer; ++il) {
|
9230
|
+
auto residual = inpL;
|
9231
|
+
|
9232
|
+
// self-attention
|
9233
|
+
{
|
9234
|
+
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
9235
|
+
model.layers[il].attn_norm,
|
9236
|
+
NULL,
|
9237
|
+
LLM_NORM_RMS, cb, il);
|
9238
|
+
cb(attn_norm_output, "attn_norm", il);
|
9239
|
+
|
9240
|
+
struct ggml_tensor * Qcur = nullptr;
|
9241
|
+
struct ggml_tensor * Kcur = nullptr;
|
9242
|
+
struct ggml_tensor * Vcur = nullptr;
|
9243
|
+
|
9244
|
+
if (model.layers[il].wqkv) {
|
9245
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
9246
|
+
cb(cur, "wqkv", il);
|
9247
|
+
|
9248
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
|
9249
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
|
9250
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
9251
|
+
}
|
9252
|
+
else {
|
9253
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
9254
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
9255
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
9256
|
+
}
|
9257
|
+
|
9258
|
+
cb(Qcur, "Qcur", il);
|
9259
|
+
cb(Kcur, "Kcur", il);
|
9260
|
+
cb(Vcur, "Vcur", il);
|
9261
|
+
|
9262
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9263
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
9264
|
+
|
9265
|
+
Qcur = ggml_rope_custom(
|
9266
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9267
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9268
|
+
);
|
9269
|
+
cb(Qcur, "Qcur", il);
|
9270
|
+
|
9271
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
9272
|
+
cb(Qcur, "Qcur", il);
|
9273
|
+
|
9274
|
+
Kcur = ggml_rope_custom(
|
9275
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
9276
|
+
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
9277
|
+
);
|
9278
|
+
cb(Kcur, "Kcur", il);
|
9279
|
+
|
9280
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9281
|
+
model.layers[il].wo, model.layers[il].bo,
|
9282
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9283
|
+
}
|
9284
|
+
|
9285
|
+
if (il == n_layer - 1) {
|
9286
|
+
// skip computing output for unused tokens
|
9287
|
+
struct ggml_tensor* inp_out_ids = build_inp_out_ids();
|
9288
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
9289
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
9290
|
+
}
|
9291
|
+
|
9292
|
+
cur = ggml_add(ctx0, cur, residual);
|
9293
|
+
residual = cur;
|
9294
|
+
|
9295
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
9296
|
+
model.layers[il].ffn_norm, NULL,
|
9297
|
+
LLM_NORM_RMS, cb, il);
|
9298
|
+
cb(cur, "ffn_norm", il);
|
9299
|
+
|
9300
|
+
// FF
|
9301
|
+
// special-case: the up and gate tensors are merged into a single tensor
|
9302
|
+
// TODO: support this in llm_build_ffn
|
9303
|
+
{
|
9304
|
+
struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
|
9305
|
+
cb(up, "ffn_up", il);
|
9306
|
+
|
9307
|
+
auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
|
9308
|
+
auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
|
9309
|
+
|
9310
|
+
y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
|
9311
|
+
cb(y, "ffn_gate", il);
|
9312
|
+
|
9313
|
+
auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
|
9314
|
+
cb(down, "ffn_down", il);
|
9315
|
+
|
9316
|
+
cur = down;
|
9317
|
+
cb(cur, "ffn_out", il);
|
9318
|
+
}
|
9319
|
+
|
9320
|
+
cur = ggml_add(ctx0, residual, cur);
|
9321
|
+
cb(cur, "l_out", il);
|
9322
|
+
|
9323
|
+
inpL = cur;
|
9324
|
+
}
|
9325
|
+
|
9326
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
9327
|
+
model.output_norm,
|
9328
|
+
NULL,
|
9329
|
+
LLM_NORM_RMS, cb, -1);
|
9330
|
+
cb(cur, "result_norm", -1);
|
9331
|
+
|
9332
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
9333
|
+
cb(cur, "result_output", -1);
|
8941
9334
|
|
8942
9335
|
ggml_build_forward_expand(gf, cur);
|
8943
9336
|
|
8944
9337
|
return gf;
|
8945
9338
|
}
|
8946
9339
|
|
9340
|
+
|
8947
9341
|
struct ggml_cgraph * build_plamo() {
|
8948
9342
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
8949
9343
|
|
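In build_phi3() above the FFN gate and up projections are stored as one merged ffn_up tensor, and the two ggml_view_2d() calls slice each row in half before SiLU gating. A standalone sketch of that split-and-gate step, assuming the ggml context and the tensors (up, ffn_down) already exist inside a graph builder:

#include "ggml.h"

// Split a merged [2*n_ff, n_tokens] activation into gate/up halves and apply
// SwiGLU-style gating, mirroring the merged-FFN handling in build_phi3().
static struct ggml_tensor * ffn_merged_silu(
        struct ggml_context * ctx,
        struct ggml_tensor  * up,         // merged activation: first half of each row = gate, second half = up
        struct ggml_tensor  * ffn_down) { // down projection weight
    const int64_t n_ff = up->ne[0] / 2;

    struct ggml_tensor * g = ggml_cont(ctx, ggml_view_2d(ctx, up, n_ff, up->ne[1],
            ggml_row_size(up->type, up->ne[0]), 0));               // gate half
    struct ggml_tensor * y = ggml_cont(ctx, ggml_view_2d(ctx, up, n_ff, up->ne[1],
            ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));   // up half (byte offset = half a row)

    y = ggml_mul(ctx, y, ggml_silu(ctx, g));  // y *= silu(gate)
    return ggml_mul_mat(ctx, ffn_down, y);    // project back to n_embd
}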
@@ -8996,9 +9390,9 @@ struct llm_build_context {
|
|
8996
9390
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
8997
9391
|
cb(Kcur, "Kcur", il);
|
8998
9392
|
|
8999
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9393
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9000
9394
|
model.layers[il].wo, NULL,
|
9001
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9395
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9002
9396
|
}
|
9003
9397
|
struct ggml_tensor * sa_out = cur;
|
9004
9398
|
|
@@ -9099,9 +9493,9 @@ struct llm_build_context {
|
|
9099
9493
|
|
9100
9494
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
9101
9495
|
|
9102
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9496
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9103
9497
|
model.layers[il].wo, model.layers[il].bo,
|
9104
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9498
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9105
9499
|
}
|
9106
9500
|
|
9107
9501
|
if (il == n_layer - 1) {
|
@@ -9206,9 +9600,9 @@ struct llm_build_context {
|
|
9206
9600
|
);
|
9207
9601
|
cb(Kcur, "Kcur", il);
|
9208
9602
|
|
9209
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9603
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9210
9604
|
model.layers[il].wo, model.layers[il].bo,
|
9211
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9605
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9212
9606
|
}
|
9213
9607
|
|
9214
9608
|
if (il == n_layer - 1) {
|
@@ -9322,9 +9716,9 @@ struct llm_build_context {
|
|
9322
9716
|
);
|
9323
9717
|
cb(Kcur, "Kcur", il);
|
9324
9718
|
|
9325
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9719
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9326
9720
|
model.layers[il].wo, NULL,
|
9327
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9721
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9328
9722
|
}
|
9329
9723
|
|
9330
9724
|
if (il == n_layer - 1) {
|
@@ -9439,9 +9833,9 @@ struct llm_build_context {
|
|
9439
9833
|
);
|
9440
9834
|
cb(Kcur, "Kcur", il);
|
9441
9835
|
|
9442
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9836
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9443
9837
|
model.layers[il].wo, model.layers[il].bo,
|
9444
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9838
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9445
9839
|
}
|
9446
9840
|
|
9447
9841
|
if (il == n_layer - 1) {
|
@@ -9569,9 +9963,9 @@ struct llm_build_context {
|
|
9569
9963
|
);
|
9570
9964
|
cb(Kcur, "Kcur", il);
|
9571
9965
|
|
9572
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
9966
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9573
9967
|
model.layers[il].wo, model.layers[il].bo,
|
9574
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
9968
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9575
9969
|
}
|
9576
9970
|
|
9577
9971
|
if (il == n_layer - 1) {
|
@@ -9690,9 +10084,9 @@ struct llm_build_context {
|
|
9690
10084
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
9691
10085
|
cb(Kcur, "Kcur", il);
|
9692
10086
|
|
9693
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10087
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9694
10088
|
model.layers[il].wo, NULL,
|
9695
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10089
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
9696
10090
|
}
|
9697
10091
|
|
9698
10092
|
if (il == n_layer - 1) {
|
@@ -9809,9 +10203,9 @@ struct llm_build_context {
|
|
9809
10203
|
);
|
9810
10204
|
cb(Kcur, "Kcur", il);
|
9811
10205
|
|
9812
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10206
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
9813
10207
|
model.layers[il].wo, model.layers[il].bo,
|
9814
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10208
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
9815
10209
|
}
|
9816
10210
|
|
9817
10211
|
if (il == n_layer - 1) {
|
@@ -10099,9 +10493,9 @@ struct llm_build_context {
|
|
10099
10493
|
);
|
10100
10494
|
cb(Kcur, "Kcur", il);
|
10101
10495
|
|
10102
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10496
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10103
10497
|
model.layers[il].wo, model.layers[il].bo,
|
10104
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10498
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10105
10499
|
}
|
10106
10500
|
|
10107
10501
|
if (il == n_layer - 1) {
|
@@ -10230,9 +10624,9 @@ struct llm_build_context {
|
|
10230
10624
|
);
|
10231
10625
|
cb(Kcur, "Kcur", il);
|
10232
10626
|
|
10233
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10627
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
10234
10628
|
model.layers[il].wo, nullptr,
|
10235
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
10629
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10236
10630
|
}
|
10237
10631
|
|
10238
10632
|
if (il == n_layer - 1) {
|
@@ -10445,6 +10839,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
10445
10839
|
{
|
10446
10840
|
result = llm.build_phi2();
|
10447
10841
|
} break;
|
10842
|
+
case LLM_ARCH_PHI3:
|
10843
|
+
{
|
10844
|
+
result = llm.build_phi3();
|
10845
|
+
} break;
|
10448
10846
|
case LLM_ARCH_PLAMO:
|
10449
10847
|
{
|
10450
10848
|
result = llm.build_plamo();
|
@@ -10655,7 +11053,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
10655
11053
|
}
|
10656
11054
|
}
|
10657
11055
|
|
10658
|
-
|
11056
|
+
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
11057
|
+
// this allows to process multiple sequences in parallel with ALiBi-based models
|
11058
|
+
if (hparams.use_alibi) {
|
10659
11059
|
const int64_t n_kv = kv_self.n;
|
10660
11060
|
|
10661
11061
|
GGML_ASSERT(lctx.inp_KQ_pos);
|
@@ -11037,7 +11437,7 @@ static int llama_decode_internal(
|
|
11037
11437
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
11038
11438
|
// after enough generations, the benefit from this heuristic disappears
|
11039
11439
|
// if we start defragmenting the cache, the benefit from this will be more important
|
11040
|
-
kv_self.n = std::min(kv_self.size, std::max(
|
11440
|
+
kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
|
11041
11441
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
11042
11442
|
}
|
11043
11443
|
}
|
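The heuristic above now rounds the number of attended KV cells up to a multiple of 256, with a floor of 256 and a ceiling of the cache size. A self-contained sketch of the arithmetic, using a local stand-in for ggml's GGML_PAD (round up to a multiple) and hypothetical cache numbers:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// local stand-in mirroring GGML_PAD from ggml.h: round x up to a multiple of n
#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main() {
    const uint32_t kv_size  = 4096; // total KV cache cells (hypothetical)
    const uint32_t cell_max = 600;  // highest cell index currently in use (hypothetical)

    // pad to 256, but never below 256 and never beyond the cache size
    const uint32_t n = std::min(kv_size, std::max(256u, GGML_PAD(cell_max, 256)));
    printf("attend to %u of %u cells\n", n, kv_size); // prints: attend to 768 of 4096 cells
    return 0;
}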
@@ -11205,6 +11605,10 @@ static int llama_decode_internal(
|
|
11205
11605
|
}
|
11206
11606
|
}
|
11207
11607
|
|
11608
|
+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
11609
|
+
// overlap with device computation.
|
11610
|
+
ggml_backend_sched_reset(lctx.sched);
|
11611
|
+
|
11208
11612
|
return 0;
|
11209
11613
|
}
|
11210
11614
|
|
@@ -11230,7 +11634,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|
11230
11634
|
// each move requires 6*n_layer tensors (see build_defrag)
|
11231
11635
|
// - source view, destination view, copy operation
|
11232
11636
|
// - x2 for keys and values
|
11233
|
-
const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
11637
|
+
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
11638
|
+
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
11639
|
+
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
|
11234
11640
|
|
11235
11641
|
// determine which KV cells to move where
|
11236
11642
|
//
|
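As a worked check of the revised budget, assuming the 8192-node graph limit used by these builders and a hypothetical 80-layer model (6*80 = 480 tensors per move):

    old: max_moves = 8192 / 480            = 17
    new: max_moves = (8192 - 2*80) / 480   = 16

The new formula simply reserves 2*n_layer = 160 nodes of headroom before dividing, which here costs one move per defrag pass.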
@@ -11554,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
11554
11960
|
}
|
11555
11961
|
case LLAMA_VOCAB_TYPE_BPE: {
|
11556
11962
|
GGML_ASSERT(false);
|
11557
|
-
return unicode_utf8_to_byte(token_data.text);
|
11963
|
+
return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
|
11558
11964
|
}
|
11559
11965
|
case LLAMA_VOCAB_TYPE_WPM: {
|
11560
11966
|
GGML_ASSERT(false);
|
@@ -11776,7 +12182,79 @@ struct llm_tokenizer_bpe {
|
|
11776
12182
|
|
11777
12183
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
11778
12184
|
int final_prev_index = -1;
|
11779
|
-
|
12185
|
+
|
12186
|
+
std::vector<std::string> word_collection;
|
12187
|
+
switch (vocab.type) {
|
12188
|
+
case LLAMA_VOCAB_TYPE_BPE:
|
12189
|
+
switch (vocab.type_pre) {
|
12190
|
+
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
12191
|
+
word_collection = unicode_regex_split(text, {
|
12192
|
+
// original regex from tokenizer.json
|
12193
|
+
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12194
|
+
|
12195
|
+
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
12196
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
12197
|
+
});
|
12198
|
+
break;
|
12199
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
12200
|
+
word_collection = unicode_regex_split(text, {
|
12201
|
+
"[\r\n]",
|
12202
|
+
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
12203
|
+
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
12204
|
+
"\\s+$",
|
12205
|
+
"[一-龥ࠀ-一가-]+",
|
12206
|
+
"\\p{N}+",
|
12207
|
+
});
|
12208
|
+
break;
|
12209
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
12210
|
+
word_collection = unicode_regex_split(text, {
|
12211
|
+
"[\r\n]",
|
12212
|
+
"\\s?\\p{L}+",
|
12213
|
+
"\\s?\\p{P}+",
|
12214
|
+
"[一-龥ࠀ-一가-]+",
|
12215
|
+
"\\p{N}+",
|
12216
|
+
});
|
12217
|
+
break;
|
12218
|
+
case LLAMA_VOCAB_PRE_TYPE_FALCON:
|
12219
|
+
word_collection = unicode_regex_split(text, {
|
12220
|
+
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
12221
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12222
|
+
"\\p{N}+",
|
12223
|
+
"[0-9][0-9][0-9]",
|
12224
|
+
});
|
12225
|
+
break;
|
12226
|
+
case LLAMA_VOCAB_PRE_TYPE_MPT:
|
12227
|
+
// TODO: MPT pre-tokenization regexes are unknown
|
12228
|
+
// the following are close, but not exact. run the following:
|
12229
|
+
// ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
|
12230
|
+
GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
|
12231
|
+
word_collection = unicode_regex_split(text, {
|
12232
|
+
"\\s?\\p{L}+",
|
12233
|
+
"\\s?\\p{P}+",
|
12234
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12235
|
+
});
|
12236
|
+
break;
|
12237
|
+
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
|
12238
|
+
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
12239
|
+
word_collection = unicode_regex_split(text, {
|
12240
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12241
|
+
});
|
12242
|
+
break;
|
12243
|
+
default:
|
12244
|
+
// default regex for BPE tokenization pre-processing
|
12245
|
+
word_collection = unicode_regex_split(text, {
|
12246
|
+
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
12247
|
+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
12248
|
+
"\\p{N}+",
|
12249
|
+
"[0-9][0-9][0-9]",
|
12250
|
+
});
|
12251
|
+
break;
|
12252
|
+
}
|
12253
|
+
break;
|
12254
|
+
default:
|
12255
|
+
GGML_ASSERT(false);
|
12256
|
+
break;
|
12257
|
+
}
|
11780
12258
|
|
11781
12259
|
symbols_final.clear();
|
11782
12260
|
|
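The regex choice above is keyed on vocab.type_pre, which the loader fills from the tokenizer.ggml.pre metadata of the GGUF file. A minimal sketch of inspecting that key with the public gguf API; the file name is hypothetical and error handling is kept to a minimum:

#include <cstdio>
#include "ggml.h"   // gguf_* API

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // hypothetical path
    if (!ctx) {
        return 1;
    }

    const int key_id = gguf_find_key(ctx, "tokenizer.ggml.pre");
    if (key_id >= 0) {
        // prints the pre-tokenizer name written by the conversion script
        printf("tokenizer.ggml.pre = %s\n", gguf_get_val_str(ctx, key_id));
    } else {
        printf("no tokenizer.ggml.pre key - the default BPE regexes will be used\n");
    }

    gguf_free(ctx);
    return 0;
}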
@@ -11903,145 +12381,6 @@ private:
|
|
11903
12381
|
work_queue.push(bigram);
|
11904
12382
|
}
|
11905
12383
|
|
11906
|
-
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
11907
|
-
std::vector<std::string> bpe_words;
|
11908
|
-
std::vector<std::string> bpe_encoded_words;
|
11909
|
-
|
11910
|
-
std::string token = "";
|
11911
|
-
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
11912
|
-
bool collecting_numeric = false;
|
11913
|
-
bool collecting_letter = false;
|
11914
|
-
bool collecting_special = false;
|
11915
|
-
bool collecting_whitespace_lookahead = false;
|
11916
|
-
bool collecting = false;
|
11917
|
-
|
11918
|
-
std::vector<std::string> text_utf;
|
11919
|
-
text_utf.reserve(text.size());
|
11920
|
-
bpe_words.reserve(text.size());
|
11921
|
-
bpe_encoded_words.reserve(text.size());
|
11922
|
-
|
11923
|
-
const auto cpts = unicode_cpts_from_utf8(text);
|
11924
|
-
for (size_t i = 0; i < cpts.size(); ++i)
|
11925
|
-
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
11926
|
-
|
11927
|
-
for (int i = 0; i < (int)text_utf.size(); i++) {
|
11928
|
-
const std::string & utf_char = text_utf[i];
|
11929
|
-
bool split_condition = false;
|
11930
|
-
int bytes_remain = text_utf.size() - i;
|
11931
|
-
// forward backward lookups
|
11932
|
-
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
11933
|
-
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
11934
|
-
|
11935
|
-
// handling contractions
|
11936
|
-
if (!split_condition && bytes_remain >= 2) {
|
11937
|
-
// 's|'t|'m|'d
|
11938
|
-
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
11939
|
-
split_condition = true;
|
11940
|
-
}
|
11941
|
-
if (split_condition) {
|
11942
|
-
if (token.size()) {
|
11943
|
-
bpe_words.emplace_back(token); // push previous content as token
|
11944
|
-
}
|
11945
|
-
token = utf_char + utf_char_next;
|
11946
|
-
bpe_words.emplace_back(token);
|
11947
|
-
token = "";
|
11948
|
-
i++;
|
11949
|
-
continue;
|
11950
|
-
}
|
11951
|
-
}
|
11952
|
-
if (!split_condition && bytes_remain >= 3) {
|
11953
|
-
// 're|'ve|'ll
|
11954
|
-
if (utf_char == "\'" && (
|
11955
|
-
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
11956
|
-
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
11957
|
-
(utf_char_next == "l" && utf_char_next_next == "l"))
|
11958
|
-
) {
|
11959
|
-
split_condition = true;
|
11960
|
-
}
|
11961
|
-
if (split_condition) {
|
11962
|
-
// current token + next token can be defined
|
11963
|
-
if (token.size()) {
|
11964
|
-
bpe_words.emplace_back(token); // push previous content as token
|
11965
|
-
}
|
11966
|
-
token = utf_char + utf_char_next + utf_char_next_next;
|
11967
|
-
bpe_words.emplace_back(token); // the contraction
|
11968
|
-
token = "";
|
11969
|
-
i += 2;
|
11970
|
-
continue;
|
11971
|
-
}
|
11972
|
-
}
|
11973
|
-
|
11974
|
-
if (!split_condition && !collecting) {
|
11975
|
-
if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
11976
|
-
collecting_letter = true;
|
11977
|
-
collecting = true;
|
11978
|
-
}
|
11979
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
11980
|
-
collecting_numeric = true;
|
11981
|
-
collecting = true;
|
11982
|
-
}
|
11983
|
-
else if (
|
11984
|
-
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
11985
|
-
(!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
11986
|
-
) {
|
11987
|
-
collecting_special = true;
|
11988
|
-
collecting = true;
|
11989
|
-
}
|
11990
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
11991
|
-
collecting_whitespace_lookahead = true;
|
11992
|
-
collecting = true;
|
11993
|
-
}
|
11994
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
11995
|
-
split_condition = true;
|
11996
|
-
}
|
11997
|
-
}
|
11998
|
-
else if (!split_condition && collecting) {
|
11999
|
-
if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
12000
|
-
split_condition = true;
|
12001
|
-
}
|
12002
|
-
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
|
12003
|
-
split_condition = true;
|
12004
|
-
}
|
12005
|
-
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
12006
|
-
split_condition = true;
|
12007
|
-
}
|
12008
|
-
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
12009
|
-
split_condition = true;
|
12010
|
-
}
|
12011
|
-
}
|
12012
|
-
|
12013
|
-
if (utf_char_next == "") {
|
12014
|
-
split_condition = true; // final
|
12015
|
-
token += utf_char;
|
12016
|
-
}
|
12017
|
-
|
12018
|
-
if (split_condition) {
|
12019
|
-
if (token.size()) {
|
12020
|
-
bpe_words.emplace_back(token);
|
12021
|
-
}
|
12022
|
-
token = utf_char;
|
12023
|
-
collecting = false;
|
12024
|
-
collecting_letter = false;
|
12025
|
-
collecting_numeric = false;
|
12026
|
-
collecting_special = false;
|
12027
|
-
collecting_whitespace_lookahead = false;
|
12028
|
-
}
|
12029
|
-
else {
|
12030
|
-
token += utf_char;
|
12031
|
-
}
|
12032
|
-
}
|
12033
|
-
|
12034
|
-
for (std::string & word : bpe_words) {
|
12035
|
-
std::string encoded_token = "";
|
12036
|
-
for (char & c : word) {
|
12037
|
-
encoded_token += unicode_byte_to_utf8(c);
|
12038
|
-
}
|
12039
|
-
bpe_encoded_words.emplace_back(encoded_token);
|
12040
|
-
}
|
12041
|
-
|
12042
|
-
return bpe_encoded_words;
|
12043
|
-
}
|
12044
|
-
|
12045
12384
|
const llama_vocab & vocab;
|
12046
12385
|
|
12047
12386
|
std::vector<llm_symbol> symbols;
|
@@ -12361,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
12361
12700
|
} break;
|
12362
12701
|
case LLAMA_VOCAB_TYPE_BPE:
|
12363
12702
|
{
|
12364
|
-
if (add_special && vocab.special_add_bos
|
12703
|
+
if (add_special && vocab.special_add_bos != 0) {
|
12365
12704
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
12366
12705
|
output.push_back(vocab.special_bos_id);
|
12367
12706
|
}
|
@@ -13268,16 +13607,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
13268
13607
|
GGML_ASSERT(ctx);
|
13269
13608
|
const int64_t t_start_sample_us = ggml_time_us();
|
13270
13609
|
|
13271
|
-
bool
|
13610
|
+
bool allow_eog = false;
|
13272
13611
|
for (const auto & stack : grammar->stacks) {
|
13273
13612
|
if (stack.empty()) {
|
13274
|
-
|
13613
|
+
allow_eog = true;
|
13275
13614
|
break;
|
13276
13615
|
}
|
13277
13616
|
}
|
13278
13617
|
|
13279
|
-
const llama_token eos = llama_token_eos(&ctx->model);
|
13280
|
-
|
13281
13618
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
13282
13619
|
candidates_decoded.reserve(candidates->size);
|
13283
13620
|
std::vector<llama_grammar_candidate> candidates_grammar;
|
@@ -13285,9 +13622,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
13285
13622
|
|
13286
13623
|
for (size_t i = 0; i < candidates->size; ++i) {
|
13287
13624
|
const llama_token id = candidates->data[i].id;
|
13288
|
-
const std::string piece = llama_token_to_piece(ctx, id);
|
13289
|
-
|
13290
|
-
|
13625
|
+
const std::string piece = llama_token_to_piece(ctx, id, false);
|
13626
|
+
|
13627
|
+
if (llama_token_is_eog(&ctx->model, id)) {
|
13628
|
+
if (!allow_eog) {
|
13291
13629
|
candidates->data[i].logit = -INFINITY;
|
13292
13630
|
}
|
13293
13631
|
} else if (piece.empty() || piece[0] == 0) {
|
@@ -13450,7 +13788,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
|
|
13450
13788
|
return result;
|
13451
13789
|
}
|
13452
13790
|
|
13453
|
-
llama_token
|
13791
|
+
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
|
13454
13792
|
GGML_ASSERT(ctx);
|
13455
13793
|
|
13456
13794
|
const int64_t t_start_sample_us = ggml_time_us();
|
@@ -13463,7 +13801,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
13463
13801
|
}
|
13464
13802
|
|
13465
13803
|
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
13466
|
-
auto & rng = ctx->rng;
|
13467
13804
|
int idx = dist(rng);
|
13468
13805
|
|
13469
13806
|
llama_token result = candidates->data[idx].id;
|
@@ -13473,10 +13810,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|
13473
13810
|
return result;
|
13474
13811
|
}
|
13475
13812
|
|
13813
|
+
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
13814
|
+
return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
|
13815
|
+
}
|
13816
|
+
|
13476
13817
|
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
13477
13818
|
const int64_t t_start_sample_us = ggml_time_us();
|
13478
13819
|
|
13479
|
-
if (
|
13820
|
+
if (llama_token_is_eog(&ctx->model, token)) {
|
13480
13821
|
for (const auto & stack : grammar->stacks) {
|
13481
13822
|
if (stack.empty()) {
|
13482
13823
|
return;
|
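The new llama_sample_token_with_rng() above moves the final draw onto a caller-supplied generator, with llama_sample_token() reduced to a wrapper around the context's own RNG. A minimal sketch of calling the RNG-taking variant, assuming its declaration is visible to the caller (whether llama.h exports it is not shown in this diff):

#include <random>
#include "llama.h"   // assumed to expose llama_sample_token_with_rng to C++ callers

// Sketch: keep one RNG per sequence ("slot") so each stream's sampling is
// reproducible independently of the context-global RNG that
// llama_sample_token() keeps using.
llama_token sample_for_slot(llama_context * ctx,
                            llama_token_data_array * candidates,
                            std::mt19937 & slot_rng) {
    return llama_sample_token_with_rng(ctx, candidates, slot_rng);
}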
@@ -13485,7 +13826,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
13485
13826
|
GGML_ASSERT(false);
|
13486
13827
|
}
|
13487
13828
|
|
13488
|
-
const std::string piece = llama_token_to_piece(ctx, token);
|
13829
|
+
const std::string piece = llama_token_to_piece(ctx, token, false);
|
13489
13830
|
|
13490
13831
|
// Note terminating 0 in decoded string
|
13491
13832
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
@@ -14131,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
14131
14472
|
}
|
14132
14473
|
|
14133
14474
|
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
14134
|
-
std::mutex mutex;
|
14135
|
-
int64_t counter = 0;
|
14136
|
-
size_t new_size = 0;
|
14137
14475
|
if (nthread < 2) {
|
14138
14476
|
// single-thread
|
14139
|
-
|
14477
|
+
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
14478
|
+
if (!ggml_validate_row_data(new_type, new_data, new_size)) {
|
14479
|
+
throw std::runtime_error("quantized data validation failed");
|
14480
|
+
}
|
14481
|
+
return new_size;
|
14140
14482
|
}
|
14141
|
-
|
14483
|
+
|
14484
|
+
std::mutex mutex;
|
14485
|
+
int64_t counter = 0;
|
14486
|
+
size_t new_size = 0;
|
14487
|
+
bool valid = true;
|
14488
|
+
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
|
14142
14489
|
nrows, n_per_row, imatrix]() {
|
14143
14490
|
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
14144
14491
|
size_t local_size = 0;
|
@@ -14153,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
14153
14500
|
}
|
14154
14501
|
lock.unlock();
|
14155
14502
|
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
14156
|
-
|
14503
|
+
size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
14504
|
+
local_size += this_size;
|
14505
|
+
|
14506
|
+
// validate the quantized data
|
14507
|
+
const size_t row_size = ggml_row_size(new_type, n_per_row);
|
14508
|
+
void * this_data = (char *) new_data + first_row * row_size;
|
14509
|
+
if (!ggml_validate_row_data(new_type, this_data, this_size)) {
|
14510
|
+
std::unique_lock<std::mutex> lock(mutex);
|
14511
|
+
valid = false;
|
14512
|
+
break;
|
14513
|
+
}
|
14157
14514
|
}
|
14158
14515
|
};
|
14159
14516
|
for (int it = 0; it < nthread - 1; ++it) {
|
@@ -14162,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
14162
14519
|
compute();
|
14163
14520
|
for (auto & w : workers) { w.join(); }
|
14164
14521
|
workers.clear();
|
14522
|
+
if (!valid) {
|
14523
|
+
throw std::runtime_error("quantized data validation failed");
|
14524
|
+
}
|
14165
14525
|
return new_size;
|
14166
14526
|
}
|
14167
14527
|
|
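The multithreaded path above hands out fixed-size row chunks through a mutex-protected counter and stops all workers as soon as one chunk fails ggml_validate_row_data(). A generic sketch of that worker pattern, with process_chunk() standing in for the quantize-and-validate step:

#include <algorithm>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

// Chunked worker pattern used by llama_tensor_quantize_internal: a shared
// counter hands out row ranges; a shared flag aborts every worker once one
// chunk fails validation. process_chunk() is a stand-in for
// ggml_quantize_chunk() + ggml_validate_row_data().
static bool quantize_rows_parallel(int64_t nrows, int64_t rows_per_chunk, int nthread,
                                   bool (*process_chunk)(int64_t first_row, int64_t nrow)) {
    std::mutex mutex;
    int64_t counter = 0;
    bool valid = true;

    auto worker = [&]() {
        while (true) {
            int64_t first_row;
            {
                std::unique_lock<std::mutex> lock(mutex);
                if (!valid || counter >= nrows) {
                    return;
                }
                first_row = counter;
                counter  += rows_per_chunk;
            }
            const int64_t nrow = std::min(rows_per_chunk, nrows - first_row);
            if (!process_chunk(first_row, nrow)) {
                std::unique_lock<std::mutex> lock(mutex);
                valid = false;
                return;
            }
        }
    };

    std::vector<std::thread> workers;
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(worker);
    }
    worker();                          // the calling thread participates too
    for (auto & w : workers) {
        w.join();
    }
    return valid;                      // the caller throws on failure, as above
}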
@@ -14224,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14224
14584
|
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
14225
14585
|
kv_overrides = v->data();
|
14226
14586
|
}
|
14227
|
-
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
14587
|
+
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
|
14228
14588
|
ml.init_mappings(false); // no prefetching
|
14229
14589
|
|
14230
14590
|
llama_model model;
|
@@ -14262,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14262
14622
|
for (auto & o : overrides) {
|
14263
14623
|
if (o.key[0] == 0) break;
|
14264
14624
|
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
14265
|
-
gguf_set_val_f32(ctx_out, o.key, o.
|
14625
|
+
gguf_set_val_f32(ctx_out, o.key, o.val_f64);
|
14266
14626
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
14267
|
-
gguf_set_val_i32(ctx_out, o.key, o.
|
14627
|
+
gguf_set_val_i32(ctx_out, o.key, o.val_i64);
|
14268
14628
|
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
14269
|
-
gguf_set_val_bool(ctx_out, o.key, o.
|
14629
|
+
gguf_set_val_bool(ctx_out, o.key, o.val_bool);
|
14630
|
+
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
14631
|
+
gguf_set_val_str(ctx_out, o.key, o.val_str);
|
14270
14632
|
} else {
|
14271
14633
|
LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
|
14272
14634
|
}
|
@@ -14308,26 +14670,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14308
14670
|
std::vector<no_init<uint8_t>> work;
|
14309
14671
|
std::vector<no_init<float>> f32_conv_buf;
|
14310
14672
|
|
14673
|
+
uint16_t n_split = 1;
|
14674
|
+
// Assume split index is continuous
|
14675
|
+
if (params->keep_split) {
|
14676
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
14677
|
+
n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
|
14678
|
+
}
|
14679
|
+
}
|
14680
|
+
std::vector<gguf_context*> ctx_outs(n_split, NULL);
|
14681
|
+
ctx_outs[0] = ctx_out;
|
14682
|
+
|
14311
14683
|
// populate the original tensors so we get an initial meta data
|
14312
14684
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
14313
|
-
|
14314
|
-
|
14685
|
+
auto weight = ml.get_weight(i);
|
14686
|
+
uint16_t i_split = params->keep_split ? weight->idx : 0;
|
14687
|
+
struct ggml_tensor * tensor = weight->tensor;
|
14688
|
+
if (ctx_outs[i_split] == NULL) {
|
14689
|
+
ctx_outs[i_split] = gguf_init_empty();
|
14690
|
+
}
|
14691
|
+
gguf_add_tensor(ctx_outs[i_split], tensor);
|
14315
14692
|
}
|
14316
14693
|
|
14317
|
-
|
14318
|
-
|
14319
|
-
|
14320
|
-
|
14694
|
+
// Set split info if needed
|
14695
|
+
if (n_split > 1) {
|
14696
|
+
for (size_t i = 0; i < ctx_outs.size(); ++i) {
|
14697
|
+
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
|
14698
|
+
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
|
14699
|
+
gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
|
14700
|
+
}
|
14701
|
+
}
|
14321
14702
|
|
14322
|
-
|
14703
|
+
int cur_split = -1;
|
14704
|
+
std::ofstream fout;
|
14705
|
+
auto close_ofstream = [&]() {
|
14706
|
+
// Write metadata and close file handler
|
14707
|
+
if (fout.is_open()) {
|
14708
|
+
fout.seekp(0);
|
14709
|
+
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
|
14710
|
+
gguf_get_meta_data(ctx_outs[cur_split], data.data());
|
14711
|
+
fout.write((const char *) data.data(), data.size());
|
14712
|
+
fout.close();
|
14713
|
+
}
|
14714
|
+
};
|
14715
|
+
auto new_ofstream = [&](int index) {
|
14716
|
+
cur_split = index;
|
14717
|
+
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
|
14718
|
+
std::string fname = fname_out;
|
14719
|
+
if (params->keep_split) {
|
14720
|
+
char split_path[PATH_MAX] = {0};
|
14721
|
+
llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
|
14722
|
+
fname = std::string(split_path);
|
14723
|
+
}
|
14323
14724
|
|
14324
|
-
|
14325
|
-
|
14725
|
+
fout = std::ofstream(fname, std::ios::binary);
|
14726
|
+
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
14727
|
+
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
|
14728
|
+
// placeholder for the meta data
|
14729
|
+
::zeros(fout, meta_size);
|
14730
|
+
};
|
14326
14731
|
|
14327
14732
|
const auto tn = LLM_TN(model.arch);
|
14328
|
-
|
14733
|
+
new_ofstream(0);
|
14329
14734
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
14330
|
-
|
14735
|
+
auto weight = ml.get_weight(i);
|
14736
|
+
struct ggml_tensor * tensor = weight->tensor;
|
14737
|
+
if (weight->idx != cur_split && params->keep_split) {
|
14738
|
+
close_ofstream();
|
14739
|
+
new_ofstream(weight->idx);
|
14740
|
+
}
|
14331
14741
|
|
14332
14742
|
const std::string name = ggml_get_name(tensor);
|
14333
14743
|
|
@@ -14482,26 +14892,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
14482
14892
|
total_size_new += new_size;
|
14483
14893
|
|
14484
14894
|
// update the gguf meta data as we go
|
14485
|
-
gguf_set_tensor_type(
|
14486
|
-
gguf_set_tensor_data(
|
14895
|
+
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
|
14896
|
+
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
|
14487
14897
|
|
14488
14898
|
// write tensor data + padding
|
14489
14899
|
fout.write((const char *) new_data, new_size);
|
14490
14900
|
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
14491
14901
|
}
|
14492
|
-
|
14493
|
-
|
14494
|
-
|
14495
|
-
fout.seekp(0);
|
14496
|
-
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
14497
|
-
gguf_get_meta_data(ctx_out, data.data());
|
14498
|
-
fout.write((const char *) data.data(), data.size());
|
14902
|
+
close_ofstream();
|
14903
|
+
for (auto & c:ctx_outs) {
|
14904
|
+
gguf_free(c);
|
14499
14905
|
}
|
14500
14906
|
|
14501
|
-
fout.close();
|
14502
|
-
|
14503
|
-
gguf_free(ctx_out);
|
14504
|
-
|
14505
14907
|
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
14506
14908
|
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
14507
14909
|
|
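With keep_split enabled, the quantizer above writes one output file per input shard and names each one with llama_split_path(). A small sketch of the naming helper; the prefix and shard count are hypothetical:

#include <cstdio>
#include "llama.h"

int main() {
    char split_path[1024];
    const int n_split = 3; // hypothetical shard count

    for (int i = 0; i < n_split; ++i) {
        // produces names of the form "<prefix>-0000N-of-00003.gguf"
        llama_split_path(split_path, sizeof(split_path), "ggml-model-Q4_K_M", i, n_split);
        printf("%s\n", split_path);
    }
    return 0;
}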
@@ -14545,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
|
|
14545
14947
|
std::unique_ptr<llama_model_loader> ml;
|
14546
14948
|
if (path_base_model) {
|
14547
14949
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
14548
|
-
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
14950
|
+
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
14549
14951
|
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
14550
14952
|
}
|
14551
14953
|
|
@@ -14804,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
|
|
14804
15206
|
/*.vocab_only =*/ false,
|
14805
15207
|
/*.use_mmap =*/ true,
|
14806
15208
|
/*.use_mlock =*/ false,
|
15209
|
+
/*.check_tensors =*/ false,
|
14807
15210
|
};
|
14808
15211
|
|
14809
15212
|
#ifdef GGML_USE_METAL
|
@@ -14840,6 +15243,7 @@ struct llama_context_params llama_context_default_params() {
|
|
14840
15243
|
/*.logits_all =*/ false,
|
14841
15244
|
/*.embeddings =*/ false,
|
14842
15245
|
/*.offload_kqv =*/ true,
|
15246
|
+
/*.flash_attn =*/ false,
|
14843
15247
|
/*.abort_callback =*/ nullptr,
|
14844
15248
|
/*.abort_callback_data =*/ nullptr,
|
14845
15249
|
};
|
@@ -14857,6 +15261,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
14857
15261
|
/*.quantize_output_tensor =*/ true,
|
14858
15262
|
/*.only_copy =*/ false,
|
14859
15263
|
/*.pure =*/ false,
|
15264
|
+
/*.keep_split =*/ false,
|
14860
15265
|
/*.imatrix =*/ nullptr,
|
14861
15266
|
/*.kv_overrides =*/ nullptr,
|
14862
15267
|
};
|
@@ -15005,6 +15410,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15005
15410
|
cparams.defrag_thold = params.defrag_thold;
|
15006
15411
|
cparams.embeddings = params.embeddings;
|
15007
15412
|
cparams.offload_kqv = params.offload_kqv;
|
15413
|
+
cparams.flash_attn = params.flash_attn;
|
15008
15414
|
cparams.pooling_type = params.pooling_type;
|
15009
15415
|
|
15010
15416
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
@@ -15012,12 +15418,20 @@ struct llama_context * llama_new_context_with_model(
|
|
15012
15418
|
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
15013
15419
|
|
15014
15420
|
// this is necessary due to kv_self.n being padded later during inference
|
15015
|
-
cparams.n_ctx
|
15421
|
+
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
|
15016
15422
|
|
15017
15423
|
// with causal attention, the batch size is limited by the context size
|
15018
15424
|
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
15019
|
-
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
15020
15425
|
|
15426
|
+
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
|
15427
|
+
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
|
15428
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
|
15429
|
+
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
|
15430
|
+
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
|
15431
|
+
cparams.n_batch = GGML_KQ_MASK_PAD;
|
15432
|
+
}
|
15433
|
+
|
15434
|
+
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
15021
15435
|
|
15022
15436
|
cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
15023
15437
|
hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
|
@@ -15049,6 +15463,23 @@ struct llama_context * llama_new_context_with_model(
|
|
15049
15463
|
}
|
15050
15464
|
}
|
15051
15465
|
|
15466
|
+
if (cparams.flash_attn && hparams.use_alibi) {
|
15467
|
+
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
|
15468
|
+
cparams.flash_attn = false;
|
15469
|
+
}
|
15470
|
+
|
15471
|
+
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
15472
|
+
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
15473
|
+
cparams.flash_attn = false;
|
15474
|
+
}
|
15475
|
+
|
15476
|
+
#ifdef GGML_USE_HIPBLAS
|
15477
|
+
if (cparams.flash_attn) {
|
15478
|
+
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
|
15479
|
+
cparams.flash_attn = false;
|
15480
|
+
}
|
15481
|
+
#endif
|
15482
|
+
|
15052
15483
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
15053
15484
|
params.seed = time(NULL);
|
15054
15485
|
}
|
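The checks above guard the new flash_attn context flag, turning it back off for ALiBi models, Grok, and HIPBLAS builds. Requesting it from the caller's side is a one-field change; a minimal sketch with a hypothetical model path:

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // hypothetical path
    if (!model) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 4096;
    cparams.flash_attn = true;   // request the fused attention path; may be forced off as logged above

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize and decode as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}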
@@ -15056,6 +15487,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15056
15487
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
15057
15488
|
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
|
15058
15489
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
15490
|
+
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
15059
15491
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
15060
15492
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
15061
15493
|
|
@@ -15184,7 +15616,7 @@ struct llama_context * llama_new_context_with_model(
|
|
15184
15616
|
}
|
15185
15617
|
ctx->backends.push_back(ctx->backend_cpu);
|
15186
15618
|
|
15187
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx
|
15619
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
15188
15620
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
15189
15621
|
llama_free(ctx);
|
15190
15622
|
return nullptr;
|
@@ -15365,6 +15797,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15365
15797
|
case LLM_ARCH_QWEN2:
|
15366
15798
|
case LLM_ARCH_QWEN2MOE:
|
15367
15799
|
case LLM_ARCH_PHI2:
|
15800
|
+
case LLM_ARCH_PHI3:
|
15368
15801
|
case LLM_ARCH_GEMMA:
|
15369
15802
|
case LLM_ARCH_STARCODER2:
|
15370
15803
|
return LLAMA_ROPE_TYPE_NEOX;
|
@@ -15378,6 +15811,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15378
15811
|
return LLAMA_ROPE_TYPE_NONE;
|
15379
15812
|
}
|
15380
15813
|
|
15814
|
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
|
15815
|
+
return ctx->cparams.pooling_type;
|
15816
|
+
}
|
15817
|
+
|
15381
15818
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
15382
15819
|
return model->hparams.n_vocab;
|
15383
15820
|
}
|
@@ -15778,6 +16215,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
15778
16215
|
const size_t s_kv_head = sizeof(uint32_t);
|
15779
16216
|
const size_t s_kv_size = sizeof(uint32_t);
|
15780
16217
|
const size_t s_kv_used = sizeof(uint32_t);
|
16218
|
+
const size_t s_v_trans = sizeof(uint32_t);
|
15781
16219
|
const size_t s_kv = ctx->kv_self.total_size();
|
15782
16220
|
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
15783
16221
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
@@ -15795,10 +16233,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
15795
16233
|
+ s_kv_head
|
15796
16234
|
+ s_kv_size
|
15797
16235
|
+ s_kv_used
|
16236
|
+
+ s_v_trans
|
15798
16237
|
+ s_kv
|
15799
16238
|
+ s_kv_cells
|
15800
16239
|
);
|
15801
16240
|
|
16241
|
+
// on session change it is very likely that the state size has changed - so we need to update this function
|
16242
|
+
static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
|
16243
|
+
|
15802
16244
|
return s_total;
|
15803
16245
|
}
|
15804
16246
|
|
@@ -15856,6 +16298,8 @@ struct llama_data_file_context : llama_data_context {
|
|
15856
16298
|
*
|
15857
16299
|
*/
|
15858
16300
|
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
16301
|
+
llama_synchronize(ctx);
|
16302
|
+
|
15859
16303
|
// copy rng
|
15860
16304
|
{
|
15861
16305
|
std::ostringstream rng_ss;
|
@@ -15942,11 +16386,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
15942
16386
|
const uint32_t kv_size = kv_self.size;
|
15943
16387
|
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
15944
16388
|
const uint32_t kv_used = kv_self.used;
|
16389
|
+
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
|
15945
16390
|
|
15946
16391
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
15947
16392
|
data_ctx->write(&kv_head, sizeof(kv_head));
|
15948
16393
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
15949
16394
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
16395
|
+
data_ctx->write(&v_trans, sizeof(v_trans));
|
15950
16396
|
|
15951
16397
|
if (kv_buf_size) {
|
15952
16398
|
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
@@ -15959,7 +16405,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
15959
16405
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
15960
16406
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
15961
16407
|
|
15962
|
-
if (kv_self.recurrent) {
|
16408
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
15963
16409
|
// v is contiguous for recurrent models
|
15964
16410
|
// TODO: use other tensors for state models than k and v
|
15965
16411
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16008,6 +16454,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
|
16008
16454
|
|
16009
16455
|
// Sets the state reading from the specified source address
|
16010
16456
|
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
16457
|
+
llama_synchronize(ctx);
|
16458
|
+
|
16011
16459
|
const uint8_t * inp = src;
|
16012
16460
|
|
16013
16461
|
// set rng
|
@@ -16090,11 +16538,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16090
16538
|
uint32_t kv_head;
|
16091
16539
|
uint32_t kv_size;
|
16092
16540
|
uint32_t kv_used;
|
16541
|
+
uint32_t v_trans;
|
16093
16542
|
|
16094
16543
|
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
16095
16544
|
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
16096
16545
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
16097
16546
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
16547
|
+
memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
|
16548
|
+
|
16549
|
+
GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
|
16098
16550
|
|
16099
16551
|
if (kv_self.size != kv_size) {
|
16100
16552
|
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
@@ -16104,6 +16556,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16104
16556
|
__func__, kv_head, kv_size, kv_self.size);
|
16105
16557
|
}
|
16106
16558
|
|
16559
|
+
llama_kv_cache_clear(ctx);
|
16560
|
+
|
16107
16561
|
if (kv_buf_size) {
|
16108
16562
|
const size_t pre_kv_buf_size = inp - src;
|
16109
16563
|
|
@@ -16115,7 +16569,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16115
16569
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
16116
16570
|
inp += k_size;
|
16117
16571
|
|
16118
|
-
if (kv_self.recurrent) {
|
16572
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
16119
16573
|
// v is contiguous for recurrent models
|
16120
16574
|
// TODO: use other tensors for state models than k and v
|
16121
16575
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
@@ -16137,8 +16591,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16137
16591
|
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
16138
16592
|
}
|
16139
16593
|
|
16140
|
-
llama_kv_cache_clear(ctx);
|
16141
|
-
|
16142
16594
|
ctx->kv_self.head = kv_head;
|
16143
16595
|
ctx->kv_self.used = kv_used;
|
16144
16596
|
|
@@ -16312,6 +16764,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
|
|
16312
16764
|
}
|
16313
16765
|
|
16314
16766
|
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
16767
|
+
llama_synchronize(ctx);
|
16768
|
+
|
16315
16769
|
const auto & kv_self = ctx->kv_self;
|
16316
16770
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16317
16771
|
|
@@ -16396,28 +16850,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
|
16396
16850
|
}
|
16397
16851
|
}
|
16398
16852
|
|
16399
|
-
//
|
16400
|
-
|
16401
|
-
|
16402
|
-
|
16403
|
-
|
16404
|
-
|
16853
|
+
// TODO: simplify, reduce copy-paste
|
16854
|
+
if (!kv_self.v_trans) {
|
16855
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16856
|
+
// Write value type
|
16857
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16858
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16405
16859
|
|
16406
|
-
|
16407
|
-
|
16408
|
-
|
16860
|
+
// Write row size of value
|
16861
|
+
const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
|
16862
|
+
data_ctx.write(&v_size_row, sizeof(v_size_row));
|
16409
16863
|
|
16410
|
-
|
16411
|
-
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16412
|
-
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16864
|
+
// Read each range of cells of v_size length each into tmp_buf and write out
|
16413
16865
|
for (const auto & range : cell_ranges) {
|
16414
16866
|
const size_t range_size = range.second - range.first;
|
16415
|
-
|
16416
|
-
tmp_buf.
|
16417
|
-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16867
|
+
tmp_buf.resize(range_size * v_size_row);
|
16868
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
16418
16869
|
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16419
16870
|
}
|
16420
16871
|
}
|
16872
|
+
} else {
|
16873
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16874
|
+
const uint32_t kv_size = kv_self.size;
|
16875
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16876
|
+
// Write value type
|
16877
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16878
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16879
|
+
|
16880
|
+
// Write element size
|
16881
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16882
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16883
|
+
|
16884
|
+
// For each row, we get the element values of each cell
|
16885
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16886
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16887
|
+
for (const auto & range : cell_ranges) {
|
16888
|
+
const size_t range_size = range.second - range.first;
|
16889
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16890
|
+
tmp_buf.resize(range_size * v_size_el);
|
16891
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16892
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16893
|
+
}
|
16894
|
+
}
|
16895
|
+
}
|
16421
16896
|
}
|
16422
16897
|
|
16423
16898
|
return data_ctx.get_size_written();
|
@@ -16429,6 +16904,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
|
|
16429
16904
|
}
|
16430
16905
|
|
16431
16906
|
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
16907
|
+
llama_synchronize(ctx);
|
16908
|
+
|
16432
16909
|
auto & kv_self = ctx->kv_self;
|
16433
16910
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16434
16911
|
|
@@ -16540,41 +17017,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
         }
     }
 
-    //
-
-
-
-
-
-
-
-
-
-
-
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
 
-
-
-
-
-
-
-
-
-
-
+            // Read row size of value
+            size_t v_size_row_ref;
+            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+            inp += sizeof(v_size_row_ref);
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+                return 0;
+            }
 
-
-
-
-
-
-
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+                inp += cell_count * v_size_row;
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
+
+            // Read element size of value
+            size_t v_size_el_ref;
+            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+            inp += sizeof(v_size_el_ref);
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+                return 0;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                    inp += cell_count * v_size_el;
+                }
             }
         }
     }
 
     const size_t nread = inp - src;
+
     return nread;
 }
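Because `llama_state_seq_set_data` now synchronizes the context on entry and validates the stored value type and row/element sizes, a sequence snapshot taken with `llama_state_seq_get_data` can be replayed into another context built from the same model. A hedged round-trip sketch using the public API from `llama.h`; the sequence ids are illustrative and error handling is trimmed:

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

// Sketch: snapshot sequence 0 of ctx_src and replay it as sequence 1 in ctx_dst.
// Assumes both contexts come from the same model.
static bool copy_seq_state(llama_context * ctx_src, llama_context * ctx_dst) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx_src, /*seq_id=*/0));
    llama_state_seq_get_data(ctx_src, buf.data(), /*seq_id=*/0);
    // set_data returns 0 (and clears the destination sequence) on a type/size mismatch
    return llama_state_seq_set_data(ctx_dst, buf.data(), /*dest_seq_id=*/1) != 0;
}
```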
|
@@ -16880,6 +17391,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
     return model->vocab.id_to_token[token].type;
 }
 
+bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+    return token != -1 && (
+        token == llama_token_eos(model) ||
+        token == llama_token_eot(model)
+    );
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
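The new `llama_token_is_eog` folds the EOS and EOT ids into a single end-of-generation check. A small hedged sketch of how a sampling loop's stop condition might use it (the surrounding loop is assumed, not shown):

```cpp
#include "llama.h"

// Previously a loop would typically check a single id:
//     if (new_token == llama_token_eos(model)) break;
// llama_token_is_eog also catches EOT-style terminators such as Llama 3's <|eot_id|>.
static bool should_stop(const llama_model * model, llama_token new_token) {
    return llama_token_is_eog(model, new_token);
}
```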
@@ -16957,7 +17475,7 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_WPM:
@@ -16972,7 +17490,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (
+        } else if (
+            (llama_is_user_defined_token(model->vocab, token)) ||
+            (llama_is_control_token     (model->vocab, token) && special)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (length < (int) result.length()) {
                 return -(int) result.length();
@@ -16985,8 +17505,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, "\xe2\x96\x85", 3);
             return 3;
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
         } else if (llama_is_byte_token(model->vocab, token)) {
             if (length < 1) {
                 return -1;
@@ -17007,15 +17525,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (
+        } else if (
+            (llama_is_user_defined_token(model->vocab, token)) ||
+            (llama_is_control_token     (model->vocab, token) && special)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (length < (int) result.length()) {
                 return -(int) result.length();
             }
             memcpy(buf, result.c_str(), result.length());
             return result.length();
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
         }
             break;
         }
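With the extra `special` flag, `llama_token_to_piece` renders user-defined tokens unconditionally and control tokens only on request, and it still returns the negated required length when the buffer is too small. A hedged wrapper sketch handling that resize-and-retry convention:

```cpp
#include <string>
#include "llama.h"

// Detokenize one token, growing the buffer when the call reports the required length
// as a negative value. `special` decides whether control tokens (e.g. <|eot_id|>)
// are rendered or suppressed.
static std::string token_to_piece(const llama_model * model, llama_token tok, bool special) {
    std::string piece(8, '\0');
    int32_t n = llama_token_to_piece(model, tok, &piece[0], (int32_t) piece.size(), special);
    if (n < 0) {
        piece.resize((size_t) -n);
        n = llama_token_to_piece(model, tok, &piece[0], (int32_t) piece.size(), special);
    }
    piece.resize(n > 0 ? (size_t) n : 0);
    return piece;
}
```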
@@ -17213,6 +17731,24 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
+    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+        // Llama 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
+        }
+        if (add_ass) {
+            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+        }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // template not supported
         return -1;
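Both new templates are reachable through the public `llama_chat_apply_template`, either by letting it detect the model's embedded template or by passing the short name directly. A hedged usage sketch for the `"llama3"` case; message contents are placeholders:

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

// Format a short conversation with the "llama3" template by name, so no model
// handle is required here.
int main() {
    const std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Summarize this diff in one sentence." },
    };
    std::string buf(1024, '\0');
    int32_t n = llama_chat_apply_template(/*model=*/nullptr, "llama3",
                                          chat.data(), chat.size(),
                                          /*add_ass=*/true, &buf[0], (int32_t) buf.size());
    if (n < 0) {
        return 1;                          // template not recognized
    }
    if ((size_t) n > buf.size()) {         // buffer too small: retry with the reported size
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "llama3", chat.data(), chat.size(),
                                      true, &buf[0], (int32_t) buf.size());
    }
    buf.resize(n);
    printf("%s", buf.c_str());
    return 0;
}
```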
@@ -17345,6 +17881,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LLAMAFILE = 1 | ";
+#else
+    s += "LLAMAFILE = 0 | ";
+#endif
 
     return s.c_str();
 }