@fugood/llama.node 0.0.1-alpha.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +36 -7
- package/README.md +9 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +18 -1
- package/lib/binding.ts +22 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +18 -1
- package/src/common.hpp +11 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/llama.cpp
CHANGED
|
@@ -75,6 +75,7 @@
|
|
|
75
75
|
#include <forward_list>
|
|
76
76
|
#include <fstream>
|
|
77
77
|
#include <functional>
|
|
78
|
+
#include <future>
|
|
78
79
|
#include <initializer_list>
|
|
79
80
|
#include <locale>
|
|
80
81
|
#include <map>
|
|
@@ -107,7 +108,6 @@
|
|
|
107
108
|
#define LLAMA_MAX_NODES 8192
|
|
108
109
|
#define LLAMA_MAX_EXPERTS 60
|
|
109
110
|
|
|
110
|
-
|
|
111
111
|
//
|
|
112
112
|
// logging
|
|
113
113
|
//
|
|
@@ -211,6 +211,7 @@ enum llm_arch {
|
|
|
211
211
|
LLM_ARCH_QWEN2,
|
|
212
212
|
LLM_ARCH_QWEN2MOE,
|
|
213
213
|
LLM_ARCH_PHI2,
|
|
214
|
+
LLM_ARCH_PHI3,
|
|
214
215
|
LLM_ARCH_PLAMO,
|
|
215
216
|
LLM_ARCH_CODESHELL,
|
|
216
217
|
LLM_ARCH_ORION,
|
|
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
246
247
|
{ LLM_ARCH_QWEN2, "qwen2" },
|
|
247
248
|
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
|
248
249
|
{ LLM_ARCH_PHI2, "phi2" },
|
|
250
|
+
{ LLM_ARCH_PHI3, "phi3" },
|
|
249
251
|
{ LLM_ARCH_PLAMO, "plamo" },
|
|
250
252
|
{ LLM_ARCH_CODESHELL, "codeshell" },
|
|
251
253
|
{ LLM_ARCH_ORION, "orion" },
|
|
@@ -314,6 +316,7 @@ enum llm_kv {
|
|
|
314
316
|
LLM_KV_SSM_TIME_STEP_RANK,
|
|
315
317
|
|
|
316
318
|
LLM_KV_TOKENIZER_MODEL,
|
|
319
|
+
LLM_KV_TOKENIZER_PRE,
|
|
317
320
|
LLM_KV_TOKENIZER_LIST,
|
|
318
321
|
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
|
319
322
|
LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
|
|
@@ -390,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
|
390
393
|
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
|
391
394
|
|
|
392
395
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
|
396
|
+
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
|
393
397
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
|
394
398
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
|
395
399
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
|
@@ -793,6 +797,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
|
793
797
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
794
798
|
},
|
|
795
799
|
},
|
|
800
|
+
{
|
|
801
|
+
LLM_ARCH_PHI3,
|
|
802
|
+
{
|
|
803
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
804
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
805
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
806
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
807
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
|
808
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
809
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
810
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
811
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
812
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
813
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
814
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
815
|
+
},
|
|
816
|
+
},
|
|
796
817
|
{
|
|
797
818
|
LLM_ARCH_PLAMO,
|
|
798
819
|
{
|
|
@@ -1824,7 +1845,7 @@ struct llama_hparams {
|
|
|
1824
1845
|
float f_logit_scale = 0.0f;
|
|
1825
1846
|
|
|
1826
1847
|
bool causal_attn = true;
|
|
1827
|
-
bool
|
|
1848
|
+
bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
|
|
1828
1849
|
|
|
1829
1850
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
1830
1851
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
|
@@ -1914,6 +1935,7 @@ struct llama_cparams {
|
|
|
1914
1935
|
bool embeddings;
|
|
1915
1936
|
bool causal_attn;
|
|
1916
1937
|
bool offload_kqv;
|
|
1938
|
+
bool flash_attn;
|
|
1917
1939
|
|
|
1918
1940
|
enum llama_pooling_type pooling_type;
|
|
1919
1941
|
|
|
@@ -2017,8 +2039,8 @@ struct llama_kv_cache {
|
|
|
2017
2039
|
bool has_shift = false;
|
|
2018
2040
|
bool do_defrag = false;
|
|
2019
2041
|
bool do_copy = false;
|
|
2020
|
-
// with recurrent state models, a cell can hold the state for more than one past token
|
|
2021
|
-
bool
|
|
2042
|
+
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
|
2043
|
+
bool v_trans = true; // the value tensor is transposed
|
|
2022
2044
|
|
|
2023
2045
|
// Note: The value of head isn't only used to optimize searching
|
|
2024
2046
|
// for a free KV slot. llama_decode_internal also uses it, so it
|
|
@@ -2095,7 +2117,8 @@ struct llama_vocab {
|
|
|
2095
2117
|
ttype type;
|
|
2096
2118
|
};
|
|
2097
2119
|
|
|
2098
|
-
enum llama_vocab_type
|
|
2120
|
+
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
2121
|
+
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
2099
2122
|
|
|
2100
2123
|
std::unordered_map<token, id> token_to_id;
|
|
2101
2124
|
std::vector<token_data> id_to_token;
|
|
@@ -2316,11 +2339,14 @@ struct llama_context {
|
|
|
2316
2339
|
|
|
2317
2340
|
static bool llama_kv_cache_init(
|
|
2318
2341
|
struct llama_kv_cache & cache,
|
|
2319
|
-
|
|
2342
|
+
const llama_context * ctx,
|
|
2320
2343
|
ggml_type type_k,
|
|
2321
2344
|
ggml_type type_v,
|
|
2322
2345
|
uint32_t kv_size,
|
|
2323
2346
|
bool offload) {
|
|
2347
|
+
const llama_model & model = ctx->model;
|
|
2348
|
+
const llama_cparams & cparams = ctx->cparams;
|
|
2349
|
+
|
|
2324
2350
|
const struct llama_hparams & hparams = model.hparams;
|
|
2325
2351
|
|
|
2326
2352
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
|
@@ -2331,8 +2357,9 @@ static bool llama_kv_cache_init(
|
|
|
2331
2357
|
|
|
2332
2358
|
// TODO: find a nicer way to add other recurrent model architectures
|
|
2333
2359
|
cache.recurrent = model.arch == LLM_ARCH_MAMBA;
|
|
2360
|
+
cache.v_trans = !cparams.flash_attn;
|
|
2334
2361
|
|
|
2335
|
-
// TODO: support mixed
|
|
2362
|
+
// TODO: support mixed recurrent Transformer architectures
|
|
2336
2363
|
// NOTE: (!a || b) is a logical implication (a -> b)
|
|
2337
2364
|
GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
|
|
2338
2365
|
GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
|
|
@@ -2543,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
|
|
|
2543
2570
|
}
|
|
2544
2571
|
cache.head = 0;
|
|
2545
2572
|
cache.used = 0;
|
|
2573
|
+
|
|
2574
|
+
for (auto & buf : cache.bufs) {
|
|
2575
|
+
ggml_backend_buffer_clear(buf, 0);
|
|
2576
|
+
}
|
|
2546
2577
|
}
|
|
2547
2578
|
|
|
2548
2579
|
static bool llama_kv_cache_seq_rm(
|
|
@@ -2863,6 +2894,7 @@ namespace GGUFMeta {
|
|
|
2863
2894
|
case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
|
|
2864
2895
|
case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
|
|
2865
2896
|
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
|
|
2897
|
+
case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
|
|
2866
2898
|
}
|
|
2867
2899
|
return "unknown";
|
|
2868
2900
|
}
|
|
@@ -2874,13 +2906,16 @@ namespace GGUFMeta {
|
|
|
2874
2906
|
__func__, override_type_to_str(ovrd->tag), ovrd->key);
|
|
2875
2907
|
switch (ovrd->tag) {
|
|
2876
2908
|
case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
|
|
2877
|
-
LLAMA_LOG_INFO("%s\n", ovrd->
|
|
2909
|
+
LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
|
|
2878
2910
|
} break;
|
|
2879
2911
|
case LLAMA_KV_OVERRIDE_TYPE_INT: {
|
|
2880
|
-
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->
|
|
2912
|
+
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
|
|
2881
2913
|
} break;
|
|
2882
2914
|
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
|
|
2883
|
-
LLAMA_LOG_INFO("%.6f\n", ovrd->
|
|
2915
|
+
LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
|
|
2916
|
+
} break;
|
|
2917
|
+
case LLAMA_KV_OVERRIDE_TYPE_STR: {
|
|
2918
|
+
LLAMA_LOG_INFO("%s\n", ovrd->val_str);
|
|
2884
2919
|
} break;
|
|
2885
2920
|
default:
|
|
2886
2921
|
// Shouldn't be possible to end up here, but just in case...
|
|
@@ -2899,7 +2934,7 @@ namespace GGUFMeta {
|
|
|
2899
2934
|
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
|
|
2900
2935
|
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
|
2901
2936
|
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
|
|
2902
|
-
target = ovrd->
|
|
2937
|
+
target = ovrd->val_bool;
|
|
2903
2938
|
return true;
|
|
2904
2939
|
}
|
|
2905
2940
|
return false;
|
|
@@ -2909,7 +2944,7 @@ namespace GGUFMeta {
|
|
|
2909
2944
|
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
|
|
2910
2945
|
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
|
2911
2946
|
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
|
|
2912
|
-
target = ovrd->
|
|
2947
|
+
target = ovrd->val_i64;
|
|
2913
2948
|
return true;
|
|
2914
2949
|
}
|
|
2915
2950
|
return false;
|
|
@@ -2919,7 +2954,7 @@ namespace GGUFMeta {
|
|
|
2919
2954
|
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
|
|
2920
2955
|
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
|
2921
2956
|
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
|
|
2922
|
-
target = ovrd->
|
|
2957
|
+
target = ovrd->val_f64;
|
|
2923
2958
|
return true;
|
|
2924
2959
|
}
|
|
2925
2960
|
return false;
|
|
@@ -2928,12 +2963,11 @@ namespace GGUFMeta {
|
|
|
2928
2963
|
template<typename OT>
|
|
2929
2964
|
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
|
|
2930
2965
|
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
|
2931
|
-
(
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
ovrd ? ovrd->key : "NULL"));
|
|
2966
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
|
|
2967
|
+
target = ovrd->val_str;
|
|
2968
|
+
return true;
|
|
2969
|
+
}
|
|
2970
|
+
return false;
|
|
2937
2971
|
}
|
|
2938
2972
|
|
|
2939
2973
|
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
|
@@ -2966,6 +3000,7 @@ struct llama_model_loader {
|
|
|
2966
3000
|
size_t n_bytes = 0;
|
|
2967
3001
|
|
|
2968
3002
|
bool use_mmap = false;
|
|
3003
|
+
bool check_tensors;
|
|
2969
3004
|
|
|
2970
3005
|
llama_files files;
|
|
2971
3006
|
llama_ftype ftype;
|
|
@@ -2980,9 +3015,13 @@ struct llama_model_loader {
|
|
|
2980
3015
|
|
|
2981
3016
|
ggml_tensor * tensor;
|
|
2982
3017
|
|
|
2983
|
-
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
|
3018
|
+
llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
|
2984
3019
|
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
|
2985
3020
|
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
|
3021
|
+
|
|
3022
|
+
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
|
|
3023
|
+
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
|
|
3024
|
+
}
|
|
2986
3025
|
}
|
|
2987
3026
|
};
|
|
2988
3027
|
std::vector<llama_tensor_weight> weights;
|
|
@@ -2995,7 +3034,7 @@ struct llama_model_loader {
|
|
|
2995
3034
|
std::string arch_name;
|
|
2996
3035
|
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
|
2997
3036
|
|
|
2998
|
-
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
|
|
3037
|
+
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
|
|
2999
3038
|
int trace = 0;
|
|
3000
3039
|
if (getenv("LLAMA_TRACE")) {
|
|
3001
3040
|
trace = atoi(getenv("LLAMA_TRACE"));
|
|
@@ -3021,15 +3060,15 @@ struct llama_model_loader {
|
|
|
3021
3060
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
3022
3061
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
3023
3062
|
|
|
3063
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
|
3064
|
+
contexts.emplace_back(ctx);
|
|
3065
|
+
|
|
3024
3066
|
// Save tensors data offset of the main file.
|
|
3025
3067
|
// For subsidiary files, `meta` tensor data offset must not be used,
|
|
3026
3068
|
// so we build a unified tensors index for weights.
|
|
3027
3069
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
3028
|
-
weights.emplace_back(0, cur->name, meta, cur);
|
|
3070
|
+
weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
|
|
3029
3071
|
}
|
|
3030
|
-
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
|
3031
|
-
contexts.emplace_back(ctx);
|
|
3032
|
-
|
|
3033
3072
|
uint16_t n_split = 0;
|
|
3034
3073
|
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
|
3035
3074
|
|
|
@@ -3063,12 +3102,13 @@ struct llama_model_loader {
|
|
|
3063
3102
|
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
|
|
3064
3103
|
}
|
|
3065
3104
|
|
|
3105
|
+
files.emplace_back(new llama_file(split_path, "rb"));
|
|
3106
|
+
contexts.emplace_back(ctx);
|
|
3107
|
+
|
|
3066
3108
|
// Save tensors data offset info of the shard.
|
|
3067
3109
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
3068
|
-
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
|
|
3110
|
+
weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
|
|
3069
3111
|
}
|
|
3070
|
-
files.emplace_back(new llama_file(split_path, "rb"));
|
|
3071
|
-
contexts.emplace_back(ctx);
|
|
3072
3112
|
|
|
3073
3113
|
gguf_free(ctx_gguf);
|
|
3074
3114
|
}
|
|
@@ -3091,9 +3131,17 @@ struct llama_model_loader {
|
|
|
3091
3131
|
|
|
3092
3132
|
fver = (enum llama_fver) gguf_get_version(meta);
|
|
3093
3133
|
|
|
3134
|
+
std::set<std::string> tensor_names;
|
|
3094
3135
|
for (auto & w : weights) {
|
|
3095
3136
|
n_elements += ggml_nelements(w.tensor);
|
|
3096
3137
|
n_bytes += ggml_nbytes(w.tensor);
|
|
3138
|
+
// make sure there is no duplicated tensor names
|
|
3139
|
+
const std::string name(w.tensor->name);
|
|
3140
|
+
auto found = tensor_names.find(name);
|
|
3141
|
+
if (found != tensor_names.end()) {
|
|
3142
|
+
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
|
|
3143
|
+
}
|
|
3144
|
+
tensor_names.insert(name);
|
|
3097
3145
|
}
|
|
3098
3146
|
|
|
3099
3147
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
|
@@ -3127,6 +3175,7 @@ struct llama_model_loader {
|
|
|
3127
3175
|
switch (type_max) {
|
|
3128
3176
|
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
|
3129
3177
|
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
|
3178
|
+
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
|
|
3130
3179
|
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
|
3131
3180
|
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
|
3132
3181
|
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
|
@@ -3199,6 +3248,7 @@ struct llama_model_loader {
|
|
|
3199
3248
|
}
|
|
3200
3249
|
|
|
3201
3250
|
this->use_mmap = use_mmap;
|
|
3251
|
+
this->check_tensors = check_tensors;
|
|
3202
3252
|
}
|
|
3203
3253
|
|
|
3204
3254
|
~llama_model_loader() {
|
|
@@ -3278,6 +3328,10 @@ struct llama_model_loader {
|
|
|
3278
3328
|
return nullptr;
|
|
3279
3329
|
}
|
|
3280
3330
|
|
|
3331
|
+
const llama_tensor_weight * get_weight(int i) const {
|
|
3332
|
+
return get_weight(get_tensor_name(i));
|
|
3333
|
+
}
|
|
3334
|
+
|
|
3281
3335
|
const llama_tensor_weight & require_weight(const char * name) const {
|
|
3282
3336
|
const llama_tensor_weight * weight = get_weight(name);
|
|
3283
3337
|
if (!weight) {
|
|
@@ -3453,6 +3507,10 @@ struct llama_model_loader {
|
|
|
3453
3507
|
file->seek(w.offs, SEEK_SET);
|
|
3454
3508
|
file->read_raw(cur->data, ggml_nbytes(cur));
|
|
3455
3509
|
}
|
|
3510
|
+
|
|
3511
|
+
if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
|
|
3512
|
+
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
|
3513
|
+
}
|
|
3456
3514
|
}
|
|
3457
3515
|
|
|
3458
3516
|
size_t size_done = 0;
|
|
@@ -3469,6 +3527,8 @@ struct llama_model_loader {
|
|
|
3469
3527
|
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
|
3470
3528
|
|
|
3471
3529
|
std::vector<no_init<uint8_t>> read_buf;
|
|
3530
|
+
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
|
|
3531
|
+
|
|
3472
3532
|
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
3473
3533
|
const auto * weight = get_weight(ggml_get_name(cur));
|
|
3474
3534
|
if (weight == nullptr) {
|
|
@@ -3490,37 +3550,66 @@ struct llama_model_loader {
|
|
|
3490
3550
|
if (bufs_mmap.count(weight->idx)) {
|
|
3491
3551
|
buf_mmap = bufs_mmap.at(weight->idx);
|
|
3492
3552
|
}
|
|
3553
|
+
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
|
|
3554
|
+
|
|
3555
|
+
if (check_tensors) {
|
|
3556
|
+
validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
|
|
3557
|
+
return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
|
|
3558
|
+
}));
|
|
3559
|
+
}
|
|
3560
|
+
|
|
3493
3561
|
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
|
3494
3562
|
if (buf_mmap && cur->data == nullptr) {
|
|
3495
|
-
ggml_backend_tensor_alloc(buf_mmap, cur,
|
|
3563
|
+
ggml_backend_tensor_alloc(buf_mmap, cur, data);
|
|
3496
3564
|
if (lmlocks) {
|
|
3497
3565
|
const auto & lmlock = lmlocks->at(weight->idx);
|
|
3498
|
-
lmlock->grow_to(weight->offs +
|
|
3566
|
+
lmlock->grow_to(weight->offs + n_size);
|
|
3499
3567
|
}
|
|
3500
3568
|
|
|
3501
3569
|
auto & mmap_used = mmaps_used[weight->idx];
|
|
3502
3570
|
mmap_used.first = std::min(mmap_used.first, weight->offs);
|
|
3503
3571
|
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
|
|
3504
3572
|
} else {
|
|
3505
|
-
ggml_backend_tensor_set(cur,
|
|
3573
|
+
ggml_backend_tensor_set(cur, data, 0, n_size);
|
|
3506
3574
|
}
|
|
3507
3575
|
} else {
|
|
3508
3576
|
GGML_ASSERT(weight->idx < files.size());
|
|
3509
3577
|
const auto & file = files.at(weight->idx);
|
|
3510
3578
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
3511
3579
|
file->seek(weight->offs, SEEK_SET);
|
|
3512
|
-
file->read_raw(cur->data,
|
|
3580
|
+
file->read_raw(cur->data, n_size);
|
|
3581
|
+
if (check_tensors) {
|
|
3582
|
+
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
|
3583
|
+
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
|
3584
|
+
}));
|
|
3585
|
+
}
|
|
3513
3586
|
} else {
|
|
3514
|
-
read_buf.resize(
|
|
3587
|
+
read_buf.resize(n_size);
|
|
3515
3588
|
file->seek(weight->offs, SEEK_SET);
|
|
3516
|
-
file->read_raw(read_buf.data(),
|
|
3589
|
+
file->read_raw(read_buf.data(), n_size);
|
|
3517
3590
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
|
3591
|
+
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
|
3592
|
+
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
|
3593
|
+
}
|
|
3518
3594
|
}
|
|
3519
3595
|
}
|
|
3520
3596
|
|
|
3521
3597
|
size_done += n_size;
|
|
3522
3598
|
}
|
|
3523
3599
|
|
|
3600
|
+
// check validation results
|
|
3601
|
+
bool validation_failed = false;
|
|
3602
|
+
for (auto & future : validation_result) {
|
|
3603
|
+
auto result = future.get();
|
|
3604
|
+
if (!result.second) {
|
|
3605
|
+
LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
|
|
3606
|
+
validation_failed = true;
|
|
3607
|
+
}
|
|
3608
|
+
}
|
|
3609
|
+
if (validation_failed) {
|
|
3610
|
+
throw std::runtime_error("found tensors with invalid data");
|
|
3611
|
+
}
|
|
3612
|
+
|
|
3524
3613
|
// check if this is the last call and do final cleanup
|
|
3525
3614
|
if (size_done >= size_data) {
|
|
3526
3615
|
// unmap offloaded tensors and metadata
|
|
@@ -3578,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
|
3578
3667
|
switch (ftype) {
|
|
3579
3668
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
|
3580
3669
|
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
|
3670
|
+
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
|
3581
3671
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
|
3582
3672
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
|
3583
3673
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
|
@@ -3955,6 +4045,16 @@ static void llm_load_hparams(
|
|
|
3955
4045
|
{
|
|
3956
4046
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
3957
4047
|
|
|
4048
|
+
switch (hparams.n_layer) {
|
|
4049
|
+
case 24: model.type = e_model::MODEL_1B; break;
|
|
4050
|
+
case 32: model.type = e_model::MODEL_3B; break;
|
|
4051
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
|
4052
|
+
}
|
|
4053
|
+
} break;
|
|
4054
|
+
case LLM_ARCH_PHI3:
|
|
4055
|
+
{
|
|
4056
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
4057
|
+
|
|
3958
4058
|
switch (hparams.n_layer) {
|
|
3959
4059
|
case 24: model.type = e_model::MODEL_1B; break;
|
|
3960
4060
|
case 32: model.type = e_model::MODEL_3B; break;
|
|
@@ -4104,7 +4204,7 @@ static void llm_load_hparams(
|
|
|
4104
4204
|
model.ftype = ml.ftype;
|
|
4105
4205
|
|
|
4106
4206
|
if (hparams.f_max_alibi_bias > 0.0f) {
|
|
4107
|
-
hparams.
|
|
4207
|
+
hparams.use_alibi = true;
|
|
4108
4208
|
}
|
|
4109
4209
|
|
|
4110
4210
|
hparams.rope_type = llama_rope_type(&model);
|
|
@@ -4127,11 +4227,13 @@ static void llm_load_vocab(
|
|
|
4127
4227
|
|
|
4128
4228
|
// determine vocab type
|
|
4129
4229
|
{
|
|
4130
|
-
std::string
|
|
4230
|
+
std::string tokenizer_model;
|
|
4231
|
+
std::string tokenizer_pre;
|
|
4131
4232
|
|
|
4132
|
-
ml.get_key(LLM_KV_TOKENIZER_MODEL,
|
|
4233
|
+
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
|
4234
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
4133
4235
|
|
|
4134
|
-
if (
|
|
4236
|
+
if (tokenizer_model == "no_vocab") {
|
|
4135
4237
|
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
|
4136
4238
|
|
|
4137
4239
|
// default special tokens
|
|
@@ -4145,7 +4247,7 @@ static void llm_load_vocab(
|
|
|
4145
4247
|
vocab.linefeed_id = -1;
|
|
4146
4248
|
|
|
4147
4249
|
return;
|
|
4148
|
-
} else if (
|
|
4250
|
+
} else if (tokenizer_model == "llama") {
|
|
4149
4251
|
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
|
4150
4252
|
|
|
4151
4253
|
// default special tokens
|
|
@@ -4190,9 +4292,27 @@ static void llm_load_vocab(
|
|
|
4190
4292
|
if (add_space_prefix_keyidx != -1) {
|
|
4191
4293
|
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
|
4192
4294
|
} // The default value of add_space_prefix is true.
|
|
4193
|
-
} else if (
|
|
4194
|
-
vocab.type =
|
|
4295
|
+
} else if (tokenizer_model == "bert") {
|
|
4296
|
+
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
|
4195
4297
|
|
|
4298
|
+
// default special tokens
|
|
4299
|
+
vocab.special_bos_id = -1;
|
|
4300
|
+
vocab.special_eos_id = -1;
|
|
4301
|
+
vocab.special_unk_id = 100;
|
|
4302
|
+
vocab.special_sep_id = 102;
|
|
4303
|
+
vocab.special_pad_id = 0;
|
|
4304
|
+
vocab.special_cls_id = 101;
|
|
4305
|
+
vocab.special_mask_id = 103;
|
|
4306
|
+
vocab.add_space_prefix = false;
|
|
4307
|
+
} else {
|
|
4308
|
+
if (tokenizer_model == "gpt2") {
|
|
4309
|
+
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
|
4310
|
+
} else {
|
|
4311
|
+
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
|
|
4312
|
+
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
|
4313
|
+
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
|
4314
|
+
return;
|
|
4315
|
+
}
|
|
4196
4316
|
// read bpe merges and populate bpe ranks
|
|
4197
4317
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
|
4198
4318
|
if (merges_keyidx == -1) {
|
|
@@ -4226,23 +4346,65 @@ static void llm_load_vocab(
|
|
|
4226
4346
|
vocab.special_pad_id = -1;
|
|
4227
4347
|
vocab.special_cls_id = -1;
|
|
4228
4348
|
vocab.special_mask_id = -1;
|
|
4229
|
-
}
|
|
4230
|
-
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
|
4349
|
+
}
|
|
4231
4350
|
|
|
4232
|
-
|
|
4233
|
-
|
|
4234
|
-
|
|
4235
|
-
|
|
4236
|
-
|
|
4237
|
-
|
|
4238
|
-
|
|
4239
|
-
|
|
4240
|
-
|
|
4351
|
+
// for now, only BPE models have pre-tokenizers
|
|
4352
|
+
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
|
4353
|
+
if (tokenizer_pre.empty()) {
|
|
4354
|
+
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
|
4355
|
+
LLAMA_LOG_WARN("%s: \n", __func__);
|
|
4356
|
+
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
|
4357
|
+
LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
|
|
4358
|
+
LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
|
|
4359
|
+
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
|
|
4360
|
+
LLAMA_LOG_WARN("%s: \n", __func__);
|
|
4361
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
4362
|
+
} else if (
|
|
4363
|
+
tokenizer_pre == "default") {
|
|
4364
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
4365
|
+
} else if (
|
|
4366
|
+
tokenizer_pre == "llama3" ||
|
|
4367
|
+
tokenizer_pre == "llama-v3" ||
|
|
4368
|
+
tokenizer_pre == "llama-bpe") {
|
|
4369
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
|
4370
|
+
} else if (
|
|
4371
|
+
tokenizer_pre == "deepseek-llm") {
|
|
4372
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
|
|
4373
|
+
} else if (
|
|
4374
|
+
tokenizer_pre == "deepseek-coder") {
|
|
4375
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
|
4376
|
+
} else if (
|
|
4377
|
+
tokenizer_pre == "falcon") {
|
|
4378
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
|
4379
|
+
} else if (
|
|
4380
|
+
tokenizer_pre == "mpt") {
|
|
4381
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
|
|
4382
|
+
} else if (
|
|
4383
|
+
tokenizer_pre == "starcoder") {
|
|
4384
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
|
|
4385
|
+
} else if (
|
|
4386
|
+
tokenizer_pre == "gpt-2") {
|
|
4387
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
4388
|
+
} else if (
|
|
4389
|
+
tokenizer_pre == "refact") {
|
|
4390
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
|
4391
|
+
} else if (
|
|
4392
|
+
tokenizer_pre == "command-r") {
|
|
4393
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
|
4394
|
+
} else if (
|
|
4395
|
+
tokenizer_pre == "qwen2") {
|
|
4396
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
|
4397
|
+
} else if (
|
|
4398
|
+
tokenizer_pre == "olmo") {
|
|
4399
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
|
4400
|
+
} else if (
|
|
4401
|
+
tokenizer_pre == "dbrx") {
|
|
4402
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
|
4403
|
+
} else {
|
|
4404
|
+
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
|
4405
|
+
}
|
|
4241
4406
|
} else {
|
|
4242
|
-
|
|
4243
|
-
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
|
4244
|
-
|
|
4245
|
-
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
|
4407
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
4246
4408
|
}
|
|
4247
4409
|
}
|
|
4248
4410
|
|
|
@@ -4352,6 +4514,7 @@ static void llm_load_vocab(
|
|
|
4352
4514
|
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
|
4353
4515
|
(t.first == "<|eot_id|>" ||
|
|
4354
4516
|
t.first == "<|im_end|>" ||
|
|
4517
|
+
t.first == "<|end|>" ||
|
|
4355
4518
|
t.first == "<end_of_turn>"
|
|
4356
4519
|
)
|
|
4357
4520
|
) {
|
|
@@ -5375,6 +5538,33 @@ static bool llm_load_tensors(
|
|
|
5375
5538
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
5376
5539
|
}
|
|
5377
5540
|
} break;
|
|
5541
|
+
case LLM_ARCH_PHI3:
|
|
5542
|
+
{
|
|
5543
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
|
5544
|
+
|
|
5545
|
+
// output
|
|
5546
|
+
{
|
|
5547
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
|
5548
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
|
|
5549
|
+
}
|
|
5550
|
+
|
|
5551
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5552
|
+
ggml_context* ctx_layer = ctx_for_layer(i);
|
|
5553
|
+
ggml_context* ctx_split = ctx_for_layer_split(i);
|
|
5554
|
+
|
|
5555
|
+
auto& layer = model.layers[i];
|
|
5556
|
+
|
|
5557
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
|
5558
|
+
|
|
5559
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
|
|
5560
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
|
5561
|
+
|
|
5562
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
|
5563
|
+
|
|
5564
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
|
5565
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
|
5566
|
+
}
|
|
5567
|
+
} break;
|
|
5378
5568
|
case LLM_ARCH_PLAMO:
|
|
5379
5569
|
{
|
|
5380
5570
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
@@ -5909,7 +6099,7 @@ static bool llm_load_tensors(
|
|
|
5909
6099
|
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
|
5910
6100
|
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
|
5911
6101
|
try {
|
|
5912
|
-
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
|
6102
|
+
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
|
|
5913
6103
|
|
|
5914
6104
|
model.hparams.vocab_only = params.vocab_only;
|
|
5915
6105
|
|
|
@@ -5947,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
|
5947
6137
|
|| !(
|
|
5948
6138
|
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
|
5949
6139
|
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
|
6140
|
+
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
|
|
5950
6141
|
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
|
5951
6142
|
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
|
5952
6143
|
)
|
|
@@ -6038,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
|
6038
6229
|
static void llm_build_kv_store(
|
|
6039
6230
|
struct ggml_context * ctx,
|
|
6040
6231
|
const llama_hparams & hparams,
|
|
6232
|
+
const llama_cparams & cparams,
|
|
6041
6233
|
const llama_kv_cache & kv,
|
|
6042
6234
|
struct ggml_cgraph * graph,
|
|
6043
6235
|
struct ggml_tensor * k_cur,
|
|
6044
6236
|
struct ggml_tensor * v_cur,
|
|
6045
|
-
int64_t n_ctx,
|
|
6046
6237
|
int32_t n_tokens,
|
|
6047
6238
|
int32_t kv_head,
|
|
6048
6239
|
const llm_build_cb & cb,
|
|
6049
6240
|
int64_t il) {
|
|
6241
|
+
const int64_t n_ctx = cparams.n_ctx;
|
|
6242
|
+
|
|
6050
6243
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
6051
6244
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
6052
6245
|
|
|
6053
6246
|
GGML_ASSERT(kv.size == n_ctx);
|
|
6054
6247
|
|
|
6055
|
-
// compute the transposed [n_tokens, n_embd] V matrix
|
|
6056
|
-
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
|
6057
|
-
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
|
6058
|
-
cb(v_cur_t, "v_cur_t", il);
|
|
6059
|
-
|
|
6060
6248
|
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
|
6061
6249
|
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
|
|
6062
6250
|
cb(k_cache_view, "k_cache_view", il);
|
|
6063
6251
|
|
|
6064
|
-
|
|
6065
|
-
|
|
6066
|
-
|
|
6252
|
+
// note: storing RoPE-ed version of K in the KV cache
|
|
6253
|
+
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
|
6254
|
+
|
|
6255
|
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
|
6256
|
+
|
|
6257
|
+
struct ggml_tensor * v_cache_view = nullptr;
|
|
6258
|
+
|
|
6259
|
+
if (cparams.flash_attn) {
|
|
6260
|
+
v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
|
|
6261
|
+
(kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
|
|
6262
|
+
} else {
|
|
6263
|
+
// note: the V cache is transposed when not using flash attention
|
|
6264
|
+
v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
|
|
6265
|
+
( n_ctx)*ggml_element_size(kv.v_l[il]),
|
|
6266
|
+
(kv_head)*ggml_element_size(kv.v_l[il]));
|
|
6267
|
+
|
|
6268
|
+
v_cur = ggml_transpose(ctx, v_cur);
|
|
6269
|
+
}
|
|
6067
6270
|
cb(v_cache_view, "v_cache_view", il);
|
|
6068
6271
|
|
|
6069
|
-
|
|
6070
|
-
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
|
6071
|
-
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
|
|
6272
|
+
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
|
|
6072
6273
|
}
|
|
6073
6274
|
|
|
6074
6275
|
static struct ggml_tensor * llm_build_norm(
|
|
@@ -6288,11 +6489,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
|
6288
6489
|
return moe_out;
|
|
6289
6490
|
}
|
|
6290
6491
|
|
|
6291
|
-
// if max_alibi_bias > 0 then apply ALiBi
|
|
6292
6492
|
static struct ggml_tensor * llm_build_kqv(
|
|
6293
6493
|
struct ggml_context * ctx,
|
|
6294
6494
|
const llama_model & model,
|
|
6295
6495
|
const llama_hparams & hparams,
|
|
6496
|
+
const llama_cparams & cparams,
|
|
6296
6497
|
const llama_kv_cache & kv,
|
|
6297
6498
|
struct ggml_cgraph * graph,
|
|
6298
6499
|
struct ggml_tensor * wo,
|
|
@@ -6300,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
6300
6501
|
struct ggml_tensor * q_cur,
|
|
6301
6502
|
struct ggml_tensor * kq_mask,
|
|
6302
6503
|
struct ggml_tensor * kq_pos,
|
|
6303
|
-
int64_t n_ctx,
|
|
6304
6504
|
int32_t n_tokens,
|
|
6305
6505
|
int32_t n_kv,
|
|
6306
6506
|
float kq_scale,
|
|
6307
6507
|
const llm_build_cb & cb,
|
|
6308
6508
|
int il) {
|
|
6509
|
+
const int64_t n_ctx = cparams.n_ctx;
|
|
6309
6510
|
const int64_t n_head = hparams.n_head;
|
|
6310
6511
|
const int64_t n_head_kv = hparams.n_head_kv;
|
|
6311
6512
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
@@ -6323,71 +6524,99 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
6323
6524
|
0);
|
|
6324
6525
|
cb(k, "k", il);
|
|
6325
6526
|
|
|
6326
|
-
struct ggml_tensor *
|
|
6327
|
-
cb(kq, "kq", il);
|
|
6527
|
+
struct ggml_tensor * cur;
|
|
6328
6528
|
|
|
6329
|
-
if (
|
|
6330
|
-
|
|
6331
|
-
|
|
6332
|
-
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
6333
|
-
}
|
|
6529
|
+
if (cparams.flash_attn) {
|
|
6530
|
+
GGML_UNUSED(model);
|
|
6531
|
+
GGML_UNUSED(n_ctx);
|
|
6334
6532
|
|
|
6335
|
-
|
|
6336
|
-
//
|
|
6337
|
-
|
|
6338
|
-
// and then :
|
|
6339
|
-
// kq = 30 * tanh(kq / 30)
|
|
6340
|
-
// before the softmax below
|
|
6533
|
+
// note: if this assert triggers, then some check has failed earlier
|
|
6534
|
+
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
|
|
6535
|
+
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
|
|
6341
6536
|
|
|
6342
|
-
//
|
|
6343
|
-
|
|
6537
|
+
// split cached v into n_head heads (not transposed)
|
|
6538
|
+
struct ggml_tensor * v =
|
|
6539
|
+
ggml_view_3d(ctx, kv.v_l[il],
|
|
6540
|
+
n_embd_head_v, n_kv, n_head_kv,
|
|
6541
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
|
6542
|
+
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
|
6543
|
+
0);
|
|
6544
|
+
cb(v, "v", il);
|
|
6344
6545
|
|
|
6345
|
-
|
|
6346
|
-
|
|
6347
|
-
|
|
6546
|
+
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
|
|
6547
|
+
|
|
6548
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
|
6549
|
+
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
|
6550
|
+
}
|
|
6551
|
+
|
|
6552
|
+
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
|
6553
|
+
} else {
|
|
6554
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
|
6555
|
+
cb(kq, "kq", il);
|
|
6556
|
+
|
|
6557
|
+
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
|
6558
|
+
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
|
6559
|
+
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
|
6560
|
+
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
6561
|
+
}
|
|
6562
|
+
|
|
6563
|
+
if (model.arch == LLM_ARCH_GROK) {
|
|
6564
|
+
// need to do the following:
|
|
6565
|
+
// multiply by attn_output_multiplyer of 0.08838834764831845
|
|
6566
|
+
// and then :
|
|
6567
|
+
// kq = 30 * tanh(kq / 30)
|
|
6568
|
+
// before the softmax below
|
|
6569
|
+
|
|
6570
|
+
//try from phi2
|
|
6571
|
+
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
6572
|
+
|
|
6573
|
+
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
|
6574
|
+
kq = ggml_scale(ctx, kq, 30);
|
|
6575
|
+
}
|
|
6348
6576
|
|
|
6349
6577
|
#if defined(GGML_USE_KOMPUTE)
|
|
6350
6578
|
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
|
6351
6579
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
|
6352
6580
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
|
6353
|
-
|
|
6354
|
-
|
|
6355
|
-
|
|
6581
|
+
if (hparams.use_alibi) {
|
|
6582
|
+
kq = ggml_scale(ctx, kq, kq_scale);
|
|
6583
|
+
cb(kq, "kq_scaled", il);
|
|
6356
6584
|
|
|
6357
|
-
|
|
6358
|
-
|
|
6585
|
+
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
|
|
6586
|
+
cb(kq, "kq_scaled_alibi", il);
|
|
6359
6587
|
|
|
6360
|
-
|
|
6361
|
-
|
|
6588
|
+
kq = ggml_add(ctx, kq, kq_mask);
|
|
6589
|
+
cb(kq, "kq_masked", il);
|
|
6362
6590
|
|
|
6363
|
-
|
|
6364
|
-
|
|
6365
|
-
|
|
6591
|
+
kq = ggml_soft_max(ctx, kq);
|
|
6592
|
+
cb(kq, "kq_soft_max", il);
|
|
6593
|
+
} else
|
|
6366
6594
|
#endif
|
|
6367
|
-
|
|
6368
|
-
|
|
6369
|
-
|
|
6370
|
-
|
|
6595
|
+
{
|
|
6596
|
+
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
|
|
6597
|
+
cb(kq, "kq_soft_max_ext", il);
|
|
6598
|
+
}
|
|
6371
6599
|
|
|
6372
|
-
|
|
6600
|
+
GGML_ASSERT(kv.size == n_ctx);
|
|
6373
6601
|
|
|
6374
|
-
|
|
6375
|
-
|
|
6376
|
-
|
|
6377
|
-
|
|
6378
|
-
|
|
6379
|
-
|
|
6380
|
-
|
|
6381
|
-
|
|
6602
|
+
// split cached v into n_head heads
|
|
6603
|
+
struct ggml_tensor * v =
|
|
6604
|
+
ggml_view_3d(ctx, kv.v_l[il],
|
|
6605
|
+
n_kv, n_embd_head_v, n_head_kv,
|
|
6606
|
+
ggml_element_size(kv.v_l[il])*n_ctx,
|
|
6607
|
+
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
|
|
6608
|
+
0);
|
|
6609
|
+
cb(v, "v", il);
|
|
6382
6610
|
|
|
6383
|
-
|
|
6384
|
-
|
|
6611
|
+
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
|
6612
|
+
cb(kqv, "kqv", il);
|
|
6385
6613
|
|
|
6386
|
-
|
|
6387
|
-
|
|
6614
|
+
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
|
6615
|
+
cb(kqv_merged, "kqv_merged", il);
|
|
6388
6616
|
|
|
6389
|
-
|
|
6390
|
-
|
|
6617
|
+
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
|
6618
|
+
cb(cur, "kqv_merged_cont", il);
|
|
6619
|
+
}
|
|
6391
6620
|
|
|
6392
6621
|
ggml_build_forward_expand(graph, cur);
|
|
6393
6622
|
|
|
@@ -6407,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
6407
6636
|
struct ggml_context * ctx,
|
|
6408
6637
|
const llama_model & model,
|
|
6409
6638
|
const llama_hparams & hparams,
|
|
6639
|
+
const llama_cparams & cparams,
|
|
6410
6640
|
const llama_kv_cache & kv,
|
|
6411
6641
|
struct ggml_cgraph * graph,
|
|
6412
6642
|
struct ggml_tensor * wo,
|
|
@@ -6416,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
6416
6646
|
struct ggml_tensor * q_cur,
|
|
6417
6647
|
struct ggml_tensor * kq_mask,
|
|
6418
6648
|
struct ggml_tensor * kq_pos,
|
|
6419
|
-
int64_t n_ctx,
|
|
6420
6649
|
int32_t n_tokens,
|
|
6421
6650
|
int32_t kv_head,
|
|
6422
6651
|
int32_t n_kv,
|
|
@@ -6430,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
6430
6659
|
ggml_build_forward_expand(graph, k_cur);
|
|
6431
6660
|
ggml_build_forward_expand(graph, v_cur);
|
|
6432
6661
|
|
|
6433
|
-
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur,
|
|
6662
|
+
llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
|
|
6434
6663
|
|
|
6435
6664
|
struct ggml_tensor * cur;
|
|
6436
6665
|
|
|
6437
|
-
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
|
6438
|
-
q_cur, kq_mask, kq_pos,
|
|
6666
|
+
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
|
|
6667
|
+
q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
|
|
6439
6668
|
cb(cur, "kqv_out", il);
|
|
6440
6669
|
|
|
6441
6670
|
return cur;
|
|
@@ -6477,6 +6706,8 @@ struct llm_build_context {
|
|
|
6477
6706
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
|
6478
6707
|
const int32_t n_orig_ctx;
|
|
6479
6708
|
|
|
6709
|
+
const bool flash_attn;
|
|
6710
|
+
|
|
6480
6711
|
const enum llama_pooling_type pooling_type;
|
|
6481
6712
|
const enum llama_rope_type rope_type;
|
|
6482
6713
|
|
|
@@ -6523,6 +6754,7 @@ struct llm_build_context {
|
|
|
6523
6754
|
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
|
|
6524
6755
|
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
|
|
6525
6756
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
|
6757
|
+
flash_attn (cparams.flash_attn),
|
|
6526
6758
|
pooling_type (cparams.pooling_type),
|
|
6527
6759
|
rope_type (hparams.rope_type),
|
|
6528
6760
|
cb (cb),
|
|
@@ -6637,15 +6869,31 @@ struct llm_build_context {
|
|
|
6637
6869
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
|
6638
6870
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
|
6639
6871
|
|
|
6640
|
-
ggml_tensor * view_v_src
|
|
6641
|
-
|
|
6642
|
-
|
|
6643
|
-
|
|
6872
|
+
ggml_tensor * view_v_src;
|
|
6873
|
+
ggml_tensor * view_v_dst;
|
|
6874
|
+
|
|
6875
|
+
if (flash_attn) {
|
|
6876
|
+
// NOTE: the V cache is not transposed when using flash attention
|
|
6877
|
+
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
|
6878
|
+
n_embd_v_gqa, nm,
|
|
6879
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
|
6880
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
|
|
6644
6881
|
|
|
6645
|
-
|
|
6646
|
-
|
|
6647
|
-
|
|
6648
|
-
|
|
6882
|
+
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
|
6883
|
+
n_embd_v_gqa, nm,
|
|
6884
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
|
6885
|
+
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
|
|
6886
|
+
} else {
|
|
6887
|
+
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
|
6888
|
+
nm, n_embd_v_gqa,
|
|
6889
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
|
6890
|
+
ggml_row_size(kv_self.v_l[il]->type, i));
|
|
6891
|
+
|
|
6892
|
+
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
|
6893
|
+
nm, n_embd_v_gqa,
|
|
6894
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
|
6895
|
+
ggml_row_size(kv_self.v_l[il]->type, id));
|
|
6896
|
+
}
|
|
6649
6897
|
|
|
6650
6898
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
|
6651
6899
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
|
@@ -6675,20 +6923,26 @@ struct llm_build_context {
|
|
|
6675
6923
|
|
|
6676
6924
|
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
|
6677
6925
|
if (causal) {
|
|
6678
|
-
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,
|
|
6926
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
|
6679
6927
|
} else {
|
|
6680
|
-
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
|
6928
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
|
6681
6929
|
}
|
|
6682
6930
|
cb(lctx.inp_KQ_mask, "KQ_mask", -1);
|
|
6683
6931
|
ggml_set_input(lctx.inp_KQ_mask);
|
|
6684
|
-
return lctx.inp_KQ_mask;
|
|
6932
|
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
|
|
6685
6933
|
}
|
|
6686
6934
|
|
|
6687
|
-
struct ggml_tensor * build_inp_KQ_pos() {
|
|
6688
|
-
|
|
6935
|
+
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
|
|
6936
|
+
if (causal) {
|
|
6937
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
|
|
6938
|
+
} else {
|
|
6939
|
+
// TODO: this will be needed for ALiBi-based BERT models
|
|
6940
|
+
// https://github.com/ggerganov/llama.cpp/pull/6826
|
|
6941
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
|
|
6942
|
+
}
|
|
6689
6943
|
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
|
|
6690
6944
|
ggml_set_input(lctx.inp_KQ_pos);
|
|
6691
|
-
return lctx.inp_KQ_pos;
|
|
6945
|
+
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
|
|
6692
6946
|
}
|
|
6693
6947
|
|
|
6694
6948
|
struct ggml_tensor * build_inp_mean() {
|
|
@@ -6794,9 +7048,9 @@ struct llm_build_context {
|
|
|
6794
7048
|
);
|
|
6795
7049
|
cb(Kcur, "Kcur", il);
|
|
6796
7050
|
|
|
6797
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7051
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
6798
7052
|
model.layers[il].wo, model.layers[il].bo,
|
|
6799
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
|
7053
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6800
7054
|
}
|
|
6801
7055
|
|
|
6802
7056
|
if (il == n_layer - 1) {
|
|
@@ -6934,9 +7188,9 @@ struct llm_build_context {
|
|
|
6934
7188
|
cb(Qcur, "Qcur", il);
|
|
6935
7189
|
cb(Kcur, "Kcur", il);
|
|
6936
7190
|
|
|
6937
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7191
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
6938
7192
|
model.layers[il].wo, NULL,
|
|
6939
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
|
7193
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6940
7194
|
}
|
|
6941
7195
|
|
|
6942
7196
|
if (il == n_layer - 1) {
|
|
@@ -7041,9 +7295,9 @@ struct llm_build_context {
|
|
|
7041
7295
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7042
7296
|
);
|
|
7043
7297
|
cb(Kcur, "Kcur", il);
|
|
7044
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7298
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7045
7299
|
model.layers[il].wo, NULL,
|
|
7046
|
-
Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
|
|
7300
|
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7047
7301
|
}
|
|
7048
7302
|
|
|
7049
7303
|
if (il == n_layer - 1) {
|
|
@@ -7161,9 +7415,9 @@ struct llm_build_context {
|
|
|
7161
7415
|
);
|
|
7162
7416
|
cb(Kcur, "Kcur", il);
|
|
7163
7417
|
|
|
7164
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7418
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7165
7419
|
model.layers[il].wo, NULL,
|
|
7166
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
|
7420
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7167
7421
|
}
|
|
7168
7422
|
|
|
7169
7423
|
if (il == n_layer - 1) {
|
|
@@ -7286,9 +7540,9 @@ struct llm_build_context {
|
|
|
7286
7540
|
);
|
|
7287
7541
|
cb(Kcur, "Kcur", il);
|
|
7288
7542
|
|
|
7289
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7543
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7290
7544
|
model.layers[il].wo, model.layers[il].bo,
|
|
7291
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr,
|
|
7545
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
7292
7546
|
}
|
|
7293
7547
|
|
|
7294
7548
|
if (il == n_layer - 1) {
|
|
@@ -7438,9 +7692,9 @@ struct llm_build_context {
|
|
|
7438
7692
|
);
|
|
7439
7693
|
cb(Kcur, "Kcur", il);
|
|
7440
7694
|
|
|
7441
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7442
|
-
|
|
7443
|
-
|
|
7695
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
7696
|
+
model.layers[il].wo, NULL,
|
|
7697
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7444
7698
|
}
|
|
7445
7699
|
|
|
7446
7700
|
if (il == n_layer - 1) {
|
|
@@ -7550,9 +7804,9 @@ struct llm_build_context {
|
|
|
7550
7804
|
|
|
7551
7805
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7552
7806
|
|
|
7553
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7807
|
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7754,9 +8008,9 @@ struct llm_build_context {
                 );
                 cb(Vcur, "Vcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, nullptr,
+                        Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -7850,9 +8104,9 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 cb(Qcur, "Qcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8143,9 +8397,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8274,14 +8528,15 @@ struct llm_build_context {
                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-
-
+                    cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                            model.layers[il].wo, model.layers[il].bo,
+                            Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 } else {
                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
+
+                    cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                             model.layers[il].wo, model.layers[il].bo,
-                            Kcur, Vcur, Qcur, KQ_mask, KQ_pos,
+                            Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 }
             }
 
@@ -8423,9 +8678,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8541,9 +8796,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8654,9 +8909,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8768,9 +9023,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -8923,9 +9178,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
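Note: every graph builder above now routes the context params (cparams) plus n_tokens, kv_head, n_kv and an explicit attention scale into llm_build_kv. As a rough illustration of the scale argument only (a hypothetical helper, not the upstream function): most call sites pass 1.0f/sqrtf(float(n_embd_head)), while builders that already scale Qcur beforehand (for example the new build_phi3 below) pass 1.0f.

    #include <cmath>

    // Hypothetical helper mirroring the scale argument in the hunks above:
    // 1/sqrt(d_head) by default, 1.0f when Q was pre-scaled with ggml_scale().
    static float kq_scale(int n_embd_head, bool q_is_prescaled) {
        return q_is_prescaled ? 1.0f : 1.0f / sqrtf((float) n_embd_head);
    }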
@@ -8967,12 +9222,140 @@ struct llm_build_context {
 
         cur = ggml_add(ctx0, cur, model.output_b);
         cb(cur, "result_output", -1);
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
+    struct ggml_cgraph * build_phi3() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            auto residual = inpL;
+
+            // self-attention
+            {
+                struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    NULL,
+                    LLM_NORM_RMS, cb, il);
+                cb(attn_norm_output, "attn_norm", il);
+
+                struct ggml_tensor * Qcur = nullptr;
+                struct ggml_tensor * Kcur = nullptr;
+                struct ggml_tensor * Vcur = nullptr;
+
+                if (model.layers[il].wqkv) {
+                    cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                    cb(cur, "wqkv", il);
+
+                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
+                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+                }
+                else {
+                    Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                    Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                    Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor* inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            }
+
+            cur = ggml_add(ctx0, cur, residual);
+            residual = cur;
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // FF
+            // special-case: the up and gate tensors are merged into a single tensor
+            // TOOD: support into llm_build_ffn
+            {
+                struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+                cb(up, "ffn_up", il);
+
+                auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
+                auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
+
+                y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
+                cb(y, "ffn_gate", il);
+
+                auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
+                cb(down, "ffn_down", il);
+
+                cur = down;
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, residual, cur);
+            cb(cur, "l_out", il);
+
+            inpL = cur;
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+            model.output_norm,
+            NULL,
+            LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
 
         return gf;
     }
 
+
     struct ggml_cgraph * build_plamo() {
         struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
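Note: the new build_phi3 graph handles Phi-3's fused feed-forward weight: ffn_up produces the gate and up halves in a single tensor, which are split with ggml_view_2d and combined as up * silu(gate). A scalar sketch of that split-and-activate step (illustrative only, not the ggml code; the function name is made up):

    #include <cmath>
    #include <vector>

    // The fused projection output holds [gate | up] per row; SwiGLU multiplies
    // the up half by silu() of the gate half, mirroring the ggml_view_2d split
    // in build_phi3 above.
    std::vector<float> swiglu_from_fused(const std::vector<float> & fused) {
        const size_t half = fused.size() / 2;
        std::vector<float> out(half);
        for (size_t i = 0; i < half; ++i) {
            const float g = fused[i];        // gate half
            const float u = fused[half + i]; // up half
            out[i] = u * (g / (1.0f + std::exp(-g))); // silu(g) = g * sigmoid(g)
        }
        return out;
    }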
@@ -9025,9 +9408,9 @@ struct llm_build_context {
                                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
             struct ggml_tensor * sa_out = cur;
 
@@ -9128,9 +9511,9 @@ struct llm_build_context {
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9235,9 +9618,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9351,9 +9734,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9468,9 +9851,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9598,9 +9981,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9719,9 +10102,9 @@ struct llm_build_context {
                                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -9838,9 +10221,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10128,9 +10511,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10259,9 +10642,9 @@ struct llm_build_context {
                 );
                 cb(Kcur, "Kcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                         model.layers[il].wo, nullptr,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10474,6 +10857,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PHI3:
+            {
+                result = llm.build_phi3();
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 result = llm.build_plamo();
@@ -10684,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-
+    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
+    // this allows to process multiple sequences in parallel with ALiBi-based models
+    if (hparams.use_alibi) {
         const int64_t n_kv = kv_self.n;
 
         GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11066,7 +11455,7 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(
+            kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
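Note: the decode heuristic above now clamps the active KV window to at least 256 cells and rounds it up to a multiple of 256 with GGML_PAD. A minimal re-statement of that arithmetic, assuming cell_max is the highest used cell (helper names are made up):

    #include <algorithm>
    #include <cstdint>

    // Same rounding as GGML_PAD(x, 256): round x up to the next multiple of 256.
    static uint32_t pad_to_256(uint32_t x) {
        return ((x + 255u) / 256u) * 256u;
    }

    // Mirrors the updated heuristic: at least 256 cells, padded, but never more
    // than the allocated cache size.
    static uint32_t kv_window(uint32_t kv_size, uint32_t cell_max) {
        return std::min(kv_size, std::max(256u, pad_to_256(cell_max)));
    }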
@@ -11234,6 +11623,10 @@ static int llama_decode_internal(
         }
     }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
     return 0;
 }
 
@@ -11259,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // each move requires 6*n_layer tensors (see build_defrag)
     //  - source view, destination view, copy operation
     //  - x2 for keys and values
-    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
 
     // determine which KV cells to move where
     //
@@ -11575,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
+    const auto & token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             auto buf = token_data.text.substr(3, 2);
@@ -11583,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ASSERT(false);
@@ -11805,7 +12200,94 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-
+
+        std::vector<std::string> word_collection;
+        switch (vocab.type) {
+            case LLAMA_VOCAB_TYPE_BPE:
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                            // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-]+",
+                            "\\p{N}",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_MPT:
+                        // TODO: MPT pre-tokenization regexes are unknown
+                        //       the following are close, but not exact. run the following:
+                        //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                        GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                        word_collection = unicode_regex_split(text, {
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_REFACT:
+                    case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{N}",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                    case LLAMA_VOCAB_PRE_TYPE_OLMO:
+                        word_collection = unicode_regex_split(text, {
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
+                    default:
+                        // default regex for BPE tokenization pre-processing
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                }
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
         symbols_final.clear();
 
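Note: the hand-rolled GPT-2 pre-tokenizer (removed in the next hunk) is replaced by per-model regex sets fed to unicode_regex_split. A much-simplified, ASCII-only sketch of what such pre-splitting does (the real code relies on Unicode properties like \p{L} and \p{N}, which std::regex does not handle; the function name here is made up):

    #include <cctype>
    #include <string>
    #include <vector>

    // Split text into runs of letters, runs of digits, and single other
    // characters - a toy stand-in for the regex-based word_collection above.
    std::vector<std::string> pre_split_ascii(const std::string & text) {
        std::vector<std::string> words;
        size_t i = 0;
        while (i < text.size()) {
            const unsigned char c = text[i];
            size_t j = i + 1;
            if (std::isalpha(c)) {
                while (j < text.size() && std::isalpha((unsigned char) text[j])) ++j;
            } else if (std::isdigit(c)) {
                while (j < text.size() && std::isdigit((unsigned char) text[j])) ++j;
            }
            words.emplace_back(text.substr(i, j - i));
            i = j;
        }
        return words;
    }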
@@ -11932,145 +12414,6 @@ private:
             work_queue.push(bigram);
         }
     }
 
-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        std::string token = "";
-        // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
-
-        std::vector<std::string> text_utf;
-        text_utf.reserve(text.size());
-        bpe_words.reserve(text.size());
-        bpe_encoded_words.reserve(text.size());
-
-        const auto cpts = unicode_cpts_from_utf8(text);
-        for (size_t i = 0; i < cpts.size(); ++i)
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
-
-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
-            // forward backward lookups
-            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_words.emplace_back(token);
-                    token = "";
-                    i++;
-                    continue;
-                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_words.emplace_back(token); // push previous content as token
-                    }
-                    token = utf_char + utf_char_next + utf_char_next_next;
-                    bpe_words.emplace_back(token); // the contraction
-                    token = "";
-                    i += 2;
-                    continue;
-                }
-            }
-
-            if (!split_condition && !collecting) {
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
-                    split_condition = true;
-                }
-            }
-
-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
-            }
-
-            if (split_condition) {
-                if (token.size()) {
-                    bpe_words.emplace_back(token);
-                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
-            }
-            else {
-                token += utf_char;
-            }
-        }
-
-        for (std::string & word : bpe_words) {
-            std::string encoded_token = "";
-            for (char & c : word) {
-                encoded_token += unicode_byte_to_utf8(c);
-            }
-            bpe_encoded_words.emplace_back(encoded_token);
-        }
-
-        return bpe_encoded_words;
-    }
-
     const llama_vocab & vocab;
 
     std::vector<llm_symbol> symbols;
@@ -12145,7 +12488,7 @@ struct llm_tokenizer_wpm {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type ==
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
@@ -12390,7 +12733,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
@@ -13478,7 +13821,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     return result;
 }
 
-llama_token
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(ctx);
 
     const int64_t t_start_sample_us = ggml_time_us();
@@ -13491,7 +13834,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     }
 
     std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
     int idx = dist(rng);
 
     llama_token result = candidates->data[idx].id;
@@ -13501,6 +13843,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }
 
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();
 
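Note: llama_sample_token is now a thin wrapper over llama_sample_token_with_rng, which takes the random generator explicitly. A hedged usage sketch: callers that want reproducible or per-request sampling can hold their own std::mt19937 (the wrapper function and seed handling below are assumptions, only the two llama.cpp entry points come from the diff):

    #include <random>
    // #include "llama.h" // assumed; provides llama_context, llama_token_data_array

    // Sample with a caller-owned generator so separate requests do not share
    // the context's global RNG state.
    llama_token sample_reproducible(llama_context * ctx,
                                    llama_token_data_array * candidates,
                                    uint32_t seed) {
        std::mt19937 rng(seed);
        return llama_sample_token_with_rng(ctx, candidates, rng);
    }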
@@ -13829,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
         if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor->type != GGML_TYPE_F16
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
         throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {
@@ -13844,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
         return;
     }
 
-    size_t block_size
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
     size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
@@ -13863,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
     auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+        } else if (typ == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
             qtype.to_float(inbuf, outbuf, nels);
        }
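Note: BF16 support threads through dequantization here via ggml_bf16_to_fp32_row. The per-element widening is cheap because bfloat16 is the top 16 bits of an IEEE-754 float32; a standalone sketch of that conversion (not the ggml implementation, function name made up):

    #include <cstdint>
    #include <cstring>

    // Widen one bfloat16 value (stored as uint16_t) to float by placing it in
    // the high 16 bits of a 32-bit pattern.
    static float bf16_to_f32(uint16_t h) {
        const uint32_t bits = (uint32_t) h << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }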
@@ -14159,14 +14517,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 }
 
 static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
-    std::mutex mutex;
-    int64_t counter = 0;
-    size_t new_size = 0;
     if (nthread < 2) {
         // single-thread
-
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+        return new_size;
     }
-
+
+    std::mutex mutex;
+    int64_t counter = 0;
+    size_t new_size = 0;
+    bool valid = true;
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
@@ -14181,7 +14545,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
             }
             lock.unlock();
             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            local_size += this_size;
+
+            // validate the quantized data
+            const size_t row_size = ggml_row_size(new_type, n_per_row);
+            void * this_data = (char *) new_data + first_row * row_size;
+            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                valid = false;
+                break;
+            }
         }
     };
     for (int it = 0; it < nthread - 1; ++it) {
@@ -14190,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     compute();
     for (auto & w : workers) { w.join(); }
     workers.clear();
+    if (!valid) {
+        throw std::runtime_error("quantized data validation failed");
+    }
     return new_size;
 }
 
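Note: the quantization worker above claims row chunks through a shared counter and flips a shared valid flag when ggml_validate_row_data rejects a chunk. A generic sketch of that claim-and-validate loop with a stand-in work/validation callback (everything below is an assumption, not upstream code):

    #include <atomic>
    #include <cstdint>
    #include <stdexcept>
    #include <thread>
    #include <vector>

    // Each worker repeatedly claims the next chunk index; on a failed
    // validation it records the failure and stops, matching the hunk above.
    void run_chunked(int64_t n_chunks, int nthread,
                     bool (*process_and_validate)(int64_t chunk)) {
        std::atomic<int64_t> counter{0};
        std::atomic<bool>    valid{true};
        auto worker = [&]() {
            for (;;) {
                const int64_t chunk = counter.fetch_add(1);
                if (chunk >= n_chunks || !valid.load()) break;
                if (!process_and_validate(chunk)) { valid.store(false); break; }
            }
        };
        std::vector<std::thread> workers;
        for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
        worker();
        for (auto & w : workers) w.join();
        if (!valid.load()) throw std::runtime_error("chunk validation failed");
    }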
@@ -14204,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
 
         // K-quants
@@ -14252,7 +14630,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
     llama_model model;
@@ -14290,11 +14668,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         for (auto & o : overrides) {
             if (o.key[0] == 0) break;
             if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-                gguf_set_val_f32(ctx_out, o.key, o.
+                gguf_set_val_f32(ctx_out, o.key, o.val_f64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-                gguf_set_val_i32(ctx_out, o.key, o.
+                gguf_set_val_i32(ctx_out, o.key, o.val_i64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-                gguf_set_val_bool(ctx_out, o.key, o.
+                gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+                gguf_set_val_str(ctx_out, o.key, o.val_str);
             } else {
                 LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
             }
@@ -14336,26 +14716,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<uint8_t>> work;
     std::vector<no_init<float>> f32_conv_buf;
 
+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-
-
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
     }
 
-
-
-
-
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
 
-
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
 
-
-
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
 
     const auto tn = LLM_TN(model.arch);
-
+    new_ofstream(0);
     for (int i = 0; i < ml.n_tensors; ++i) {
-
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }
 
         const std::string name = ggml_get_name(tensor);
 
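Note: with params->keep_split the quantizer writes one output file per input shard, naming each with llama_split_path exactly as in the hunk above. A small hedged usage sketch (the prefix, buffer size and helper function are placeholders; the exact file-name pattern is how split GGUF files are conventionally named, e.g. "model-00001-of-00004.gguf"):

    #include <cstdio>
    // #include "llama.h" // assumed; declares llama_split_path

    // Print the shard names that keep_split quantization would produce for a
    // made-up "model" prefix.
    void print_split_names(int n_split) {
        char path[1024] = {0};
        for (int i = 0; i < n_split; ++i) {
            llama_split_path(path, sizeof(path), "model", i, n_split);
            std::printf("%s\n", path);
        }
    }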
@@ -14510,26 +14938,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(
-        gguf_set_tensor_data(
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-
-
-    fout.seekp(0);
-    std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-    gguf_get_meta_data(ctx_out, data.data());
-    fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
     }
 
-    fout.close();
-
-    gguf_free(ctx_out);
-
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
@@ -14573,7 +14993,7 @@ static int llama_apply_lora_from_file_internal(
     std::unique_ptr<llama_model_loader> ml;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
         ml->init_mappings(/*prefetch*/ false); // no prefetching
     }
 
@@ -14832,6 +15252,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
+        /*.check_tensors =*/ false,
     };
 
 #ifdef GGML_USE_METAL
@@ -14868,6 +15289,7 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all =*/ false,
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
+        /*.flash_attn =*/ false,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -14885,6 +15307,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
        /*.only_copy =*/ false,
         /*.pure =*/ false,
+        /*.keep_split =*/ false,
         /*.imatrix =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
     };
@@ -15033,6 +15456,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.defrag_thold = params.defrag_thold;
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
+    cparams.flash_attn = params.flash_attn;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15040,12 +15464,20 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
-    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
@@ -15077,6 +15509,16 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (cparams.flash_attn && hparams.use_alibi) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15084,6 +15526,7 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
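Note: flash_attn is now a first-class context parameter, defaulted off and force-disabled for ALiBi models and Grok. A hedged sketch of turning it on when creating a context (model loading and error handling omitted; only the field and function names come from the diff):

    // #include "llama.h" // assumed

    // Request the fused flash-attention path; llama_new_context_with_model may
    // still turn it off (and log a warning) for unsupported configurations.
    llama_context * make_ctx_with_fa(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true;
        return llama_new_context_with_model(model, cparams);
    }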
@@ -15212,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
|
|
|
15212
15655
|
}
|
|
15213
15656
|
ctx->backends.push_back(ctx->backend_cpu);
|
|
15214
15657
|
|
|
15215
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx
|
|
15658
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
|
15216
15659
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
|
15217
15660
|
llama_free(ctx);
|
|
15218
15661
|
return nullptr;
|
|
@@ -15393,6 +15836,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
|
15393
15836
|
case LLM_ARCH_QWEN2:
|
|
15394
15837
|
case LLM_ARCH_QWEN2MOE:
|
|
15395
15838
|
case LLM_ARCH_PHI2:
|
|
15839
|
+
case LLM_ARCH_PHI3:
|
|
15396
15840
|
case LLM_ARCH_GEMMA:
|
|
15397
15841
|
case LLM_ARCH_STARCODER2:
|
|
15398
15842
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
@@ -15406,6 +15850,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
|
15406
15850
|
return LLAMA_ROPE_TYPE_NONE;
|
|
15407
15851
|
}
|
|
15408
15852
|
|
|
15853
|
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
|
|
15854
|
+
return ctx->cparams.pooling_type;
|
|
15855
|
+
}
|
|
15856
|
+
|
|
15409
15857
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
|
15410
15858
|
return model->hparams.n_vocab;
|
|
15411
15859
|
}
|
|
@@ -15806,6 +16254,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
|
15806
16254
|
const size_t s_kv_head = sizeof(uint32_t);
|
|
15807
16255
|
const size_t s_kv_size = sizeof(uint32_t);
|
|
15808
16256
|
const size_t s_kv_used = sizeof(uint32_t);
|
|
16257
|
+
const size_t s_v_trans = sizeof(uint32_t);
|
|
15809
16258
|
const size_t s_kv = ctx->kv_self.total_size();
|
|
15810
16259
|
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
|
|
15811
16260
|
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
|
@@ -15823,10 +16272,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
|
|
|
15823
16272
|
+ s_kv_head
|
|
15824
16273
|
+ s_kv_size
|
|
15825
16274
|
+ s_kv_used
|
|
16275
|
+
+ s_v_trans
|
|
15826
16276
|
+ s_kv
|
|
15827
16277
|
+ s_kv_cells
|
|
15828
16278
|
);
|
|
15829
16279
|
|
|
16280
|
+
// on session change it is very likely that the state size has changed - so we need to update this function
|
|
16281
|
+
static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
|
|
16282
|
+
|
|
15830
16283
|
return s_total;
|
|
15831
16284
|
}
|
|
15832
16285
|
|
|
@@ -15884,6 +16337,8 @@ struct llama_data_file_context : llama_data_context {
|
|
|
15884
16337
|
*
|
|
15885
16338
|
*/
|
|
15886
16339
|
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
|
16340
|
+
llama_synchronize(ctx);
|
|
16341
|
+
|
|
15887
16342
|
// copy rng
|
|
15888
16343
|
{
|
|
15889
16344
|
std::ostringstream rng_ss;
|
|
@@ -15970,11 +16425,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
|
15970
16425
|
const uint32_t kv_size = kv_self.size;
|
|
15971
16426
|
const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
|
|
15972
16427
|
const uint32_t kv_used = kv_self.used;
|
|
16428
|
+
const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
|
|
15973
16429
|
|
|
15974
16430
|
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
|
15975
16431
|
data_ctx->write(&kv_head, sizeof(kv_head));
|
|
15976
16432
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
|
15977
16433
|
data_ctx->write(&kv_used, sizeof(kv_used));
|
|
16434
|
+
data_ctx->write(&v_trans, sizeof(v_trans));
|
|
15978
16435
|
|
|
15979
16436
|
if (kv_buf_size) {
|
|
15980
16437
|
const size_t pre_kv_buf_size = data_ctx->get_size_written();
|
|
@@ -15987,7 +16444,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
|
|
15987
16444
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
|
15988
16445
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
|
15989
16446
|
|
|
15990
|
-
if (kv_self.recurrent) {
|
|
16447
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
|
15991
16448
|
// v is contiguous for recurrent models
|
|
15992
16449
|
// TODO: use other tensors for state models than k and v
|
|
15993
16450
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
|
@@ -16036,6 +16493,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
|
|
16036
16493
|
|
|
16037
16494
|
// Sets the state reading from the specified source address
|
|
16038
16495
|
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
16496
|
+
llama_synchronize(ctx);
|
|
16497
|
+
|
|
16039
16498
|
const uint8_t * inp = src;
|
|
16040
16499
|
|
|
16041
16500
|
// set rng
|
|
@@ -16118,11 +16577,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
16118
16577
|
uint32_t kv_head;
|
|
16119
16578
|
uint32_t kv_size;
|
|
16120
16579
|
uint32_t kv_used;
|
|
16580
|
+
uint32_t v_trans;
|
|
16121
16581
|
|
|
16122
16582
|
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
|
16123
16583
|
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
|
16124
16584
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
|
16125
16585
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
|
16586
|
+
memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
|
|
16587
|
+
|
|
16588
|
+
GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
|
|
16126
16589
|
|
|
16127
16590
|
if (kv_self.size != kv_size) {
|
|
16128
16591
|
// the KV cache needs to be big enough to load all the KV cells from the saved state
|
|
@@ -16132,6 +16595,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
16132
16595
|
__func__, kv_head, kv_size, kv_self.size);
|
|
16133
16596
|
}
|
|
16134
16597
|
|
|
16598
|
+
llama_kv_cache_clear(ctx);
|
|
16599
|
+
|
|
16135
16600
|
if (kv_buf_size) {
|
|
16136
16601
|
const size_t pre_kv_buf_size = inp - src;
|
|
16137
16602
|
|
|
@@ -16143,7 +16608,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
16143
16608
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
|
16144
16609
|
inp += k_size;
|
|
16145
16610
|
|
|
16146
|
-
if (kv_self.recurrent) {
|
|
16611
|
+
if (kv_self.recurrent || !kv_self.v_trans) {
|
|
16147
16612
|
// v is contiguous for recurrent models
|
|
16148
16613
|
// TODO: use other tensors for state models than k and v
|
|
16149
16614
|
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
|
@@ -16165,8 +16630,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
16165
16630
|
GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
|
|
16166
16631
|
}
|
|
16167
16632
|
|
|
16168
|
-
llama_kv_cache_clear(ctx);
|
|
16169
|
-
|
|
16170
16633
|
ctx->kv_self.head = kv_head;
|
|
16171
16634
|
ctx->kv_self.used = kv_used;
|
|
16172
16635
|
|
|
@@ -16340,6 +16803,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
 }
 
 static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
     const auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented
 
@@ -16424,28 +16889,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
     }
 
-    // For the values, they are transposed, so we also need the element size and get the element ranges from each row
-    const uint32_t kv_size = kv_self.size;
-    for (int il = 0; il < (int)n_layer; ++il) {
-        // Write value type
-        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-        data_ctx.write(&v_type_i, sizeof(v_type_i));
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Write value type
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            data_ctx.write(&v_type_i, sizeof(v_type_i));
 
-        // Write element size
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        data_ctx.write(&v_size_el, sizeof(v_size_el));
+            // Write row size of value
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            data_ctx.write(&v_size_row, sizeof(v_size_row));
 
-        // For each row, we get the element values of each cell
-        for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-            // Read each range of cells of v_size_el length each into tmp_buf and write out
+            // Read each range of cells of v_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-                const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                tmp_buf.resize(range_size * v_size_el);
-                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                tmp_buf.resize(range_size * v_size_row);
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
                 data_ctx.write(tmp_buf.data(), tmp_buf.size());
             }
         }
+    } else {
+        // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+        const uint32_t kv_size = kv_self.size;
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Write value type
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+            // Write element size
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+            // For each row, we get the element values of each cell
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                    tmp_buf.resize(range_size * v_size_el);
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                    data_ctx.write(tmp_buf.data(), tmp_buf.size());
+                }
+            }
+        }
     }
 
     return data_ctx.get_size_written();
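With this hunk `llama_state_seq_get_data_internal` serializes the V cells in two layouts: whole rows when the V cache is not transposed, per-element column ranges otherwise. The public entry points are unchanged; a hedged sketch of capturing one sequence, assuming `ctx` already holds a decoded sequence `seq_id`:

```cpp
#include <vector>
#include "llama.h"

// Sketch: serialize the KV cells belonging to a single sequence.
static std::vector<uint8_t> save_seq(llama_context * ctx, llama_seq_id seq_id) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx, seq_id));
    const size_t written = llama_state_seq_get_data(ctx, buf.data(), seq_id);
    buf.resize(written); // get_data returns the number of bytes actually written
    return buf;
}
```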
@@ -16457,6 +16943,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
 }
 
 size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+    llama_synchronize(ctx);
+
     auto & kv_self = ctx->kv_self;
     GGML_ASSERT(!kv_self.recurrent); // not implemented
 
@@ -16568,41 +17056,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
         }
     }
 
-    // For each layer, read the values for each cell (transposed)
-    for (int il = 0; il < (int)n_layer; ++il) {
-        // Read type of value
-        int32_t v_type_i_ref;
-        memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
-        inp += sizeof(v_type_i_ref);
-        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-        if (v_type_i != v_type_i_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
-            return 0;
-        }
+    // TODO: simplify, reduce copy-paste
+    if (!kv_self.v_trans) {
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
 
-        // Read element size of value
-        size_t v_size_el_ref;
-        memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
-        inp += sizeof(v_size_el_ref);
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        if (v_size_el != v_size_el_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
-            return 0;
-        }
+            // Read row size of value
+            size_t v_size_row_ref;
+            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+            inp += sizeof(v_size_row_ref);
+            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+                return 0;
+            }
 
-        if (cell_count) {
-            // For each row in the transposed matrix, read the values for the whole cell range
-            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
-                ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
-                inp += cell_count * v_size_el;
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+                inp += cell_count * v_size_row;
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (int il = 0; il < (int)n_layer; ++il) {
+            // Read type of value
+            int32_t v_type_i_ref;
+            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+            inp += sizeof(v_type_i_ref);
+            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return 0;
+            }
+
+            // Read element size of value
+            size_t v_size_el_ref;
+            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+            inp += sizeof(v_size_el_ref);
+            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+                return 0;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                    inp += cell_count * v_size_el;
+                }
             }
         }
     }
 
     const size_t nread = inp - src;
+
     return nread;
 }
 
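The reader mirrors the two layouts and returns 0 (after wiping the destination sequence) on any type, row-size, or element-size mismatch. A short sketch of restoring a blob produced by the save sketch above into a different sequence id:

```cpp
#include <vector>
#include "llama.h"

// Sketch: load a sequence captured by llama_state_seq_get_data into `dest_seq_id`.
// A return value of 0 means the blob was rejected (mismatched cell type or size).
static bool restore_seq(llama_context * ctx, const std::vector<uint8_t> & buf, llama_seq_id dest_seq_id) {
    return llama_state_seq_set_data(ctx, buf.data(), dest_seq_id) > 0;
}
```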
@@ -16983,9 +17505,10 @@ int32_t llama_tokenize(
 
 static std::string llama_decode_text(const std::string & text) {
     std::string decoded_text;
-
-
-
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
     }
 
     return decoded_text;
@@ -17257,6 +17780,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else {
         // template not supported
         return -1;
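The new branch matches the Phi 3 template either by the literal name `phi3` or by the `<|assistant|>` and `<|end|>` markers in a model's template string. A hedged sketch of rendering it through the public `llama_chat_apply_template` API; the template name is passed directly so no model handle should be needed (an assumption), and the buffer size is illustrative:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "What is 2 + 2?"               },
    };

    std::vector<char> buf(1024);
    const int32_t n = llama_chat_apply_template(
        /*model  =*/ nullptr,   // tmpl is given explicitly
        /*tmpl   =*/ "phi3",
        chat, 2,
        /*add_ass=*/ true,
        buf.data(), (int32_t) buf.size());

    if (n > 0 && n <= (int32_t) buf.size()) {
        // Expected shape, per the branch above:
        // <|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>\n
        std::printf("%.*s\n", n, buf.data());
    }
    return 0;
}
```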
@@ -17340,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
         /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
         /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
         /*.n_eval =*/ std::max(1, ctx->n_eval),
     };
 
@@ -17389,6 +17921,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LLAMAFILE = 1 | ";
+#else
+    s += "LLAMAFILE = 0 | ";
+#endif
 
     return s.c_str();
 }