llama_cpp 0.5.2 → 0.6.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +14 -8
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +307 -127
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +200 -94
- data/ext/llama_cpp/src/ggml-metal.metal +264 -82
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +1647 -865
- data/ext/llama_cpp/src/ggml.h +143 -52
- data/ext/llama_cpp/src/llama.cpp +1427 -635
- data/ext/llama_cpp/src/llama.h +308 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,3 +1,4 @@
+#define LLAMA_API_INTERNAL
 #include "llama.h"

 #include "ggml.h"
@@ -71,6 +72,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -91,12 +93,12 @@
 //

 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal (
-static void llama_log_callback_default(
+static void llama_log_internal (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

-#define LLAMA_LOG_INFO(...) llama_log_internal(
-#define LLAMA_LOG_WARN(...) llama_log_internal(
-#define LLAMA_LOG_ERROR(...) llama_log_internal(
+#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

 //
 // helpers
@@ -108,7 +110,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }

-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,17 +162,19 @@ enum llm_arch {
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };

 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,
-    { LLM_ARCH_FALCON,
-    { LLM_ARCH_GPT2,
-    { LLM_ARCH_GPTJ,
-    { LLM_ARCH_GPTNEOX,
-    { LLM_ARCH_MPT,
-    { LLM_ARCH_BAICHUAN, "baichuan"
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
 };

 enum llm_kv {
@@ -218,16 +222,16 @@ enum llm_kv {
 };

 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture"
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"
-    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment"
-    { LLM_KV_GENERAL_NAME, "general.name"
-    { LLM_KV_GENERAL_AUTHOR, "general.author"
-    { LLM_KV_GENERAL_URL, "general.url"
-    { LLM_KV_GENERAL_DESCRIPTION, "general.description"
-    { LLM_KV_GENERAL_LICENSE, "general.license"
-    { LLM_KV_GENERAL_SOURCE_URL, "general.
-    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.
+    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_NAME, "general.name" },
+    { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_URL, "general.url" },
+    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+    { LLM_KV_GENERAL_LICENSE, "general.license" },
+    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -376,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -430,7 +449,7 @@ struct LLM_TN {
 //

 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -442,7 +461,7 @@ struct LLM_TN {
     } else if (req) { \
         throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
-}
+} while (0)

 //
 // ggml helpers
@@ -680,6 +699,7 @@ struct llama_mmap {
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -862,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default

 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -880,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to

 struct llama_state {
     // We save the log callback globally
-
+    ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };

@@ -889,9 +909,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -901,24 +923,24 @@ enum e_model {

 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;

-// default hparams (LLaMA 7B)
 struct llama_hparams {
-
-    uint32_t
-    uint32_t
-    uint32_t n_embd
-    uint32_t n_head
-    uint32_t n_head_kv
-    uint32_t n_layer
-    uint32_t n_rot
-    uint32_t n_ff
-
-    float f_norm_eps
-    float f_norm_rms_eps
-
-    float
-    float
+    bool vocab_only;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base_train;
+    float rope_freq_scale_train;

     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -935,15 +957,18 @@ struct llama_hparams {
     uint32_t n_embd_gqa() const {
         return n_embd/n_gqa();
     }
+};

-
-
-
-
-
-
-
-
+struct llama_cparams {
+    uint32_t n_ctx; // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads; // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool mul_mat_q;
 };

 struct llama_layer {
@@ -960,16 +985,47 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;

+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;

     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
+};
+
+struct llama_kv_cell {
+    llama_pos pos = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
 };

+// ring-buffer of cached KV data
 struct llama_kv_cache {
+    bool has_shift = false;
+
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;

@@ -977,8 +1033,6 @@ struct llama_kv_cache {

     llama_buffer buf;

-    int n; // number of tokens currently in the cache
-
     ~llama_kv_cache() {
         if (ctx) {
             ggml_free(ctx);
@@ -1040,10 +1094,11 @@ struct llama_model {

     std::string name = "n/a";

-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;

     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;

     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1091,11 +1146,8 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model) : model(model),
+    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
-        if (model_owner) {
-            delete &model;
-        }
 #ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
@@ -1106,27 +1158,26 @@ struct llama_context {
         }
     }

+    llama_cparams cparams;
+
+    const llama_model & model;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
+
     std::mt19937 rng;

     bool has_evaluated_once = false;

+    int64_t t_start_us;
+    int64_t t_load_us;
     int64_t t_sample_us = 0;
-    int64_t t_eval_us = 0;
     int64_t t_p_eval_us = 0;
+    int64_t t_eval_us = 0;

     int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-    const llama_model & model;
-
-    bool model_owner = false;
-
-    int64_t t_load_us;
-    int64_t t_start_us;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
+    int32_t n_eval = 0; // number of eval calls

     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
@@ -1161,16 +1212,23 @@ static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
         struct llama_kv_cache & cache,
         ggml_type wtype,
-
+        uint32_t n_ctx,
         int n_gpu_layers) {
-    const
-    const
+    const uint32_t n_embd = hparams.n_embd_gqa();
+    const uint32_t n_layer = hparams.n_layer;

     const int64_t n_mem = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;

+    cache.has_shift = false;
+
+    cache.head = 0;
+    cache.size = n_ctx;
+
+    cache.cells.clear();
+    cache.cells.resize(n_ctx);
+
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    cache.n = 0;

     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1191,17 +1249,154 @@ static bool llama_kv_cache_init(

     (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
-
+    size_t vram_kv_cache = 0;
+
+    if (n_gpu_layers > (int)n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
-    if (n_gpu_layers > n_layer + 2) {
+    if (n_gpu_layers > (int)n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS

     return true;
 }

+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+static bool llama_kv_cache_find_slot(
+        struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
+    const uint32_t n_ctx = cache.size;
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens > n_ctx) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+        return false;
+    }
+
+    uint32_t n_tested = 0;
+
+    while (true) {
+        if (cache.head + n_tokens > n_ctx) {
+            cache.head = 0;
+            n_tested += n_ctx - cache.head;
+            continue;
+        }
+
+        bool found = true;
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            if (cache.cells[cache.head + i].pos >= 0) {
+                found = false;
+                cache.head += i + 1;
+                n_tested += i + 1;
+                break;
+            }
+        }
+
+        if (found) {
+            break;
+        }
+
+        if (n_tested >= n_ctx) {
+            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            return false;
+        }
+    }
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        cache.cells[cache.head + i].pos = batch.pos[i];
+        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+    }
+
+    return true;
+}
+
+// find how many cells are currently in use
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size - 1; i > 0; --i) {
+        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+            return i + 1;
+        }
+    }
+
+    return 0;
+}
+
+static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+    if (c0 < 0) c0 = 0;
+    if (c1 < 0) c1 = cache.size;
+
+    for (int32_t i = c0; i < c1; ++i) {
+        cache.cells[i].pos = -1;
+        cache.cells[i].seq_id.clear();
+    }
+}
+
+static void llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.erase(seq_id);
+            if (cache.cells[i].seq_id.empty()) {
+                cache.cells[i].pos = -1;
+            }
+        }
+    }
+}
+
+static void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id_src,
+        llama_seq_id seq_id_dst,
+        llama_pos p0,
+        llama_pos p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (!cache.cells[i].has_seq_id(seq_id)) {
+            cache.cells[i].pos = -1;
+            cache.cells[i].seq_id.clear();
+        }
+    }
+}
+
+static void llama_kv_cache_seq_shift(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        llama_pos delta) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].pos += delta;
+            if (cache.cells[i].pos < 0) {
+                cache.cells[i].pos = -1;
+                cache.cells[i].seq_id.clear();
+            } else {
+                cache.has_shift = true;
+                cache.cells[i].delta = delta;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
@@ -1244,6 +1439,7 @@ struct llama_model_loader {
     int n_created = 0;

     int64_t n_elements = 0;
+    size_t n_bytes = 0;

     bool use_mmap = false;

@@ -1276,6 +1472,7 @@ struct llama_model_loader {
         const char * name = gguf_get_tensor_name(ctx_gguf, i);
         struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
         n_elements += ggml_nelements(t);
+        n_bytes += ggml_nbytes(t);
     }

     LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1521,7 +1718,7 @@ struct llama_model_loader {
             lmlock->grow_to(size_lock);
         }
         break;
-#
+#ifdef GGML_USE_CUBLAS
     case GGML_BACKEND_GPU:
     case GGML_BACKEND_GPU_SPLIT:
         // old code:
@@ -1554,7 +1751,15 @@ struct llama_model_loader {
 // load LLaMA models
 //

-std::string
+static std::string llama_model_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1587,9 +1792,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B: return "1B";
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1608,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {

 static void llm_load_hparams(
         llama_model_loader & ml,
-        llama_model & model
-        int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale) {
+        llama_model & model) {
     struct gguf_context * ctx = ml.ctx_gguf;

     const auto kv = LLM_KV(model.arch);
@@ -1622,40 +1826,25 @@ static void llm_load_hparams(
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY,
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32,
+    GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+    GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));

     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

-    //
-
-
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
-    }
+    // rope_freq_base (optional)
+    hparams.rope_freq_base_train = 10000.0f;
+    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));

     // rope_freq_scale (inverse of the kv) is optional
-
-
-
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
-    }
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    hparams.rope_freq_scale_train = 1.0f/ropescale;

     // sanity check for n_rot (optional)
     {
@@ -1707,14 +1896,21 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_STARCODER:
+        {
+            GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+            switch (hparams.n_layer) {
+                case 24: model.type = e_model::MODEL_1B; break;
+                case 36: model.type = e_model::MODEL_3B; break;
+                case 42: model.type = e_model::MODEL_7B; break;
+                case 40: model.type = e_model::MODEL_15B; break;
+                default: model.type = e_model::MODEL_UNKNOWN;
+            }
+        } break;
         default: (void)0;
-    }
+    }

     model.ftype = ml.ftype;
-
-    hparams.n_ctx = n_ctx;
-    hparams.rope_freq_base = rope_freq_base;
-    hparams.rope_freq_scale = rope_freq_scale;
 }

 // TODO: This should probably be in llama.h
@@ -1735,20 +1931,18 @@ static void llm_load_vocab(
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }

+    const float * scores = nullptr;
     const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx
-
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
     }

-    const
-
+    const int * toktypes = nullptr;
     const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx
-
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }

-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
     // determine vocab type
     {
         std::string tokenizer_name;
@@ -1816,8 +2010,8 @@ static void llm_load_vocab(

         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
-        token_data.score = scores[i];
-        token_data.type = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1840,27 +2034,31 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & vocab = model.vocab;

     // hparams
-    LLAMA_LOG_INFO("%s: format
-    LLAMA_LOG_INFO("%s: arch
-    LLAMA_LOG_INFO("%s: vocab type
-    LLAMA_LOG_INFO("%s: n_vocab
-    LLAMA_LOG_INFO("%s: n_merges
-    LLAMA_LOG_INFO("%s: n_ctx_train
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s: model
-    LLAMA_LOG_INFO("%s: model
-
+    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
+    LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+    LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }

     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@@ -1877,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 static void llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
-        int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
-        const bool mul_mat_q,
-        bool low_vram,
-        ggml_type memory_type,
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
@@ -1922,11 +2116,9 @@ static void llm_load_tensors(
     }

     (void) main_gpu;
-
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
-    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1961,9 +2153,9 @@ static void llm_load_tensors(
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                 // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                backend_norm =
+                backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                backend_norm =
+                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32

                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2027,9 +2219,9 @@ static void llm_load_tensors(
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                 // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                backend_norm =
+                backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                backend_norm =
+                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32

                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2097,9 +2289,9 @@ static void llm_load_tensors(
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                 // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                backend_norm =
+                backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                backend_norm =
+                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32

                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2160,29 +2352,100 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
+                            ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
-    }
+    }
     }

     ml.done_getting_tensors();

     // print memory requirements
     {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
         // this is the total memory required to run the inference
         size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory

-
-        const size_t mem_required_state = scale*hparams.kv_size();
-
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-            mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
-        (void) n_batch;
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2191,36 +2454,17 @@ static void llm_load_tensors(
         if (n_gpu_layers > (int) hparams.n_layer) {
             LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
         }
-        size_t vram_kv_cache = 0;

 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
-        const int max_offloadable_layers =
-        if (n_gpu_layers > (int) hparams.n_layer + 1) {
-            if (low_vram) {
-                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
-        if (n_gpu_layers > (int) hparams.n_layer + 2) {
-            if (low_vram) {
-                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
+        const int max_offloadable_layers = hparams.n_layer + 3;
 #elif defined(GGML_USE_CLBLAST)
         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS

-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
-
-        LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
-            __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2233,7 +2477,7 @@ static void llm_load_tensors(
     }

     (void) tensor_split;
-#
+#ifdef GGML_USE_CUBLAS
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -2255,29 +2499,24 @@ static void llm_load_tensors(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        int n_ctx,
-        int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
         bool vocab_only,
         llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
     try {
-
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;

-        llm_load_arch (
-        llm_load_hparams(
-        llm_load_vocab (
+        llm_load_arch (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab (ml, model);

-        llm_load_print_meta(
+        llm_load_print_meta(ml, model);

         if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
@@ -2289,8 +2528,8 @@ static bool llama_model_load(
         }

         llm_load_tensors(
-
-            main_gpu, tensor_split,
+            ml, model, n_gpu_layers,
+            main_gpu, tensor_split,
             use_mlock, progress_callback, progress_callback_user_data);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2302,17 +2541,10 @@ static bool llama_model_load(

 static struct ggml_cgraph * llm_build_llama(
         llama_context & lctx,
-        const
-        const float * embd,
-        int n_tokens,
-        int n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+        const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;

     const auto & kv_self = lctx.kv_self;

@@ -2320,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(

     const int64_t n_embd = hparams.n_embd;
     const int64_t n_layer = hparams.n_layer;
-    const int64_t n_ctx =
+    const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -2328,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(

     GGML_ASSERT(n_embd_head == hparams.n_rot);

-    const float freq_base =
-    const float freq_scale =
+    const float freq_base = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
     const float norm_rms_eps = hparams.f_norm_rms_eps;

     const int n_gpu_layers = model.n_gpu_layers;

+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("n_kv = %d\n", n_kv);
+
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
@@ -2351,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);

         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data,
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");

@@ -2366,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
         GGML_ASSERT(false && "not implemented");
 #endif

-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd,
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }

@@ -2379,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(

     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v = llama_nop;
@@ -2398,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2398
2635
|
}
|
2399
2636
|
#endif // GGML_USE_CUBLAS
|
2400
2637
|
|
2638
|
+
// KQ_scale
|
2401
2639
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
2640
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2402
2641
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
2403
2642
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2404
|
-
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(
|
2643
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
2644
|
+
}
|
2645
|
+
|
2646
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
2647
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
2648
|
+
offload_func_kq(KQ_mask);
|
2649
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
2650
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
2651
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2652
|
+
float * data = (float *) KQ_mask->data;
|
2653
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
2654
|
+
|
2655
|
+
for (int h = 0; h < 1; ++h) {
|
2656
|
+
for (int j = 0; j < n_tokens; ++j) {
|
2657
|
+
const llama_pos pos = batch.pos[j];
|
2658
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
2659
|
+
|
2660
|
+
for (int i = 0; i < n_kv; ++i) {
|
2661
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
2662
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
2663
|
+
}
|
2664
|
+
}
|
2665
|
+
}
|
2666
|
+
}
|
2667
|
+
}
|
2668
|
+
|
2669
|
+
// KQ_pos - contains the positions
|
2670
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
2671
|
+
offload_func_kq(KQ_pos);
|
2672
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
2673
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
2674
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2675
|
+
int * data = (int *) KQ_pos->data;
|
2676
|
+
for (int i = 0; i < n_tokens; ++i) {
|
2677
|
+
data[i] = batch.pos[i];
|
2678
|
+
}
|
2679
|
+
}
|
2680
|
+
|
2681
|
+
// shift the entire K-cache if needed
|
2682
|
+
if (do_rope_shift) {
|
2683
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
2684
|
+
offload_func_kq(K_shift);
|
2685
|
+
ggml_set_name(K_shift, "K_shift");
|
2686
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
2687
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2688
|
+
int * data = (int *) K_shift->data;
|
2689
|
+
for (int i = 0; i < n_ctx; ++i) {
|
2690
|
+
data[i] = kv_self.cells[i].delta;
|
2691
|
+
}
|
2692
|
+
}
|
2693
|
+
|
2694
|
+
for (int il = 0; il < n_layer; ++il) {
|
2695
|
+
struct ggml_tensor * tmp =
|
2696
|
+
ggml_rope_custom_inplace(ctx0,
|
2697
|
+
ggml_view_3d(ctx0, kv_self.k,
|
2698
|
+
n_embd_head, n_head_kv, n_ctx,
|
2699
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
2700
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2701
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
2702
|
+
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
|
2703
|
+
offload_func_kq(tmp);
|
2704
|
+
ggml_build_forward_expand(gf, tmp);
|
2705
|
+
}
|
2405
2706
|
}
|
2406
|
-
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2407
2707
|
|
2408
2708
|
for (int il = 0; il < n_layer; ++il) {
|
2409
2709
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
@@ -2441,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2441
2741
|
offload_func_kq(tmpq);
|
2442
2742
|
ggml_set_name(tmpq, "tmpq");
|
2443
2743
|
|
2444
|
-
struct ggml_tensor * Kcur =
|
2744
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2445
2745
|
offload_func_kq(Kcur);
|
2446
2746
|
ggml_set_name(Kcur, "Kcur");
|
2447
2747
|
|
2448
|
-
struct ggml_tensor * Qcur =
|
2748
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2449
2749
|
offload_func_kq(Qcur);
|
2450
2750
|
ggml_set_name(Qcur, "Qcur");
|
2451
2751
|
|
2452
2752
|
// store key and value to memory
|
2453
2753
|
{
|
2454
|
-
// compute the transposed [
|
2754
|
+
// compute the transposed [n_tokens, n_embd] V matrix
|
2455
2755
|
|
2456
2756
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
2457
2757
|
offload_func_v(tmpv);
|
2458
2758
|
ggml_set_name(tmpv, "tmpv");
|
2459
2759
|
|
2460
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
|
2760
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
2461
2761
|
offload_func_v(Vcur);
|
2462
2762
|
ggml_set_name(Vcur, "Vcur");
|
2463
2763
|
|
2464
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
2764
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
2465
2765
|
offload_func_kq(k);
|
2466
2766
|
ggml_set_name(k, "k");
|
2467
2767
|
|
2468
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
2768
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
2469
2769
|
( n_ctx)*ggml_element_size(kv_self.v),
|
2470
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
2770
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
2471
2771
|
offload_func_v(v);
|
2472
2772
|
ggml_set_name(v, "v");
|
2473
2773
|
|
@@ -2482,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2482
2782
|
|
2483
2783
|
struct ggml_tensor * K =
|
2484
2784
|
ggml_view_3d(ctx0, kv_self.k,
|
2485
|
-
n_embd_head,
|
2785
|
+
n_embd_head, n_kv, n_head_kv,
|
2486
2786
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2487
2787
|
ggml_element_size(kv_self.k)*n_embd_head,
|
2488
2788
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -2495,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2495
2795
|
ggml_set_name(KQ, "KQ");
|
2496
2796
|
|
2497
2797
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
2498
|
-
// KQ_scaled shape [
|
2499
|
-
struct ggml_tensor * KQ_scaled =
|
2798
|
+
// KQ_scaled shape [n_kv, n_tokens, n_head, 1]
|
2799
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
2500
2800
|
offload_func_kq(KQ_scaled);
|
2501
2801
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
2502
2802
|
|
2503
2803
|
// KQ_masked = mask_past(KQ_scaled)
|
2504
|
-
struct ggml_tensor * KQ_masked =
|
2804
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
2505
2805
|
offload_func_kq(KQ_masked);
|
2506
2806
|
ggml_set_name(KQ_masked, "KQ_masked");
|
2507
2807
|
|
2508
2808
|
// KQ = soft_max(KQ_masked)
|
2509
|
-
struct ggml_tensor * KQ_soft_max =
|
2809
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
2510
2810
|
offload_func_v(KQ_soft_max);
|
2511
2811
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
2512
2812
|
|
2513
2813
|
// split cached V into n_head heads
|
2514
2814
|
struct ggml_tensor * V =
|
2515
2815
|
ggml_view_3d(ctx0, kv_self.v,
|
2516
|
-
|
2816
|
+
n_kv, n_embd_head, n_head_kv,
|
2517
2817
|
ggml_element_size(kv_self.v)*n_ctx,
|
2518
2818
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
2519
2819
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -2528,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2528
2828
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
2529
2829
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
2530
2830
|
// is there a better way?
|
2531
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type,
|
2831
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
|
2532
2832
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
2533
2833
|
#endif
|
2534
2834
|
|
@@ -2537,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

-            // cur = KQV_merged.contiguous().view(n_embd,
-            cur =
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");

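The new k/v store above addresses the cache with kv_head instead of n_past: the element offset (n_embd_gqa)*(il*n_ctx + kv_head) selects the first free cell of layer il in a flat [n_layer * n_ctx * n_embd_gqa] buffer. A minimal stand-alone sketch of that addressing, using hypothetical sizes (none of them taken from the diff):

    #include <cstdio>
    #include <vector>

    // Illustrative only: flat K cache laid out as [n_layer][n_ctx][n_embd_gqa],
    // mirroring the offset arithmetic of the updated ggml_view_1d call.
    int main() {
        const int n_layer = 2, n_ctx = 8, n_embd_gqa = 4;   // hypothetical sizes
        std::vector<float> k_cache(n_layer * n_ctx * n_embd_gqa, 0.0f);

        const int il       = 1;   // layer being processed
        const int kv_head  = 3;   // first free cell in the ring buffer
        const int n_tokens = 2;   // tokens appended by this batch

        // same formula as the diff, in elements rather than bytes
        const size_t offset = (size_t) n_embd_gqa * (il * n_ctx + kv_head);

        // write n_tokens*n_embd_gqa contiguous values starting at that offset
        for (int i = 0; i < n_tokens * n_embd_gqa; ++i) {
            k_cache[offset + i] = 1.0f;
        }
        std::printf("wrote %d values at element offset %zu\n", n_tokens * n_embd_gqa, offset);
        return 0;
    }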
@@ -2631,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
     return gf;
 }

-
 static struct ggml_cgraph * llm_build_baichaun(
          llama_context & lctx,
-     const
-     const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;

     const auto & kv_self = lctx.kv_self;

@@ -2652,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2652
2942
|
|
2653
2943
|
const int64_t n_embd = hparams.n_embd;
|
2654
2944
|
const int64_t n_layer = hparams.n_layer;
|
2655
|
-
const int64_t n_ctx =
|
2945
|
+
const int64_t n_ctx = cparams.n_ctx;
|
2656
2946
|
const int64_t n_head = hparams.n_head;
|
2657
2947
|
const int64_t n_head_kv = hparams.n_head_kv;
|
2658
2948
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -2660,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2660
2950
|
|
2661
2951
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
2662
2952
|
|
2663
|
-
const float freq_base =
|
2664
|
-
const float freq_scale =
|
2953
|
+
const float freq_base = cparams.rope_freq_base;
|
2954
|
+
const float freq_scale = cparams.rope_freq_scale;
|
2665
2955
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
2666
2956
|
|
2667
2957
|
const int n_gpu_layers = model.n_gpu_layers;
|
2668
2958
|
|
2959
|
+
const int32_t n_tokens = batch.n_tokens;
|
2960
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
2961
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
2962
|
+
|
2963
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
2964
|
+
|
2669
2965
|
auto & buf_compute = lctx.buf_compute;
|
2670
2966
|
|
2671
2967
|
struct ggml_init_params params = {
|
@@ -2683,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2683
2979
|
struct ggml_tensor * cur;
|
2684
2980
|
struct ggml_tensor * inpL;
|
2685
2981
|
|
2686
|
-
if (
|
2687
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
2982
|
+
if (batch.token) {
|
2983
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
2688
2984
|
|
2689
2985
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
2690
2986
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2691
|
-
memcpy(inp_tokens->data,
|
2987
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
2692
2988
|
}
|
2693
2989
|
ggml_set_name(inp_tokens, "inp_tokens");
|
2694
2990
|
|
@@ -2698,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2698
2994
|
GGML_ASSERT(false && "not implemented");
|
2699
2995
|
#endif
|
2700
2996
|
|
2701
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
2997
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
2702
2998
|
|
2703
2999
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
2704
3000
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2705
|
-
memcpy(inpL->data, embd,
|
3001
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
2706
3002
|
}
|
2707
3003
|
}
|
2708
3004
|
|
@@ -2711,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2711
3007
|
|
2712
3008
|
// offload functions set the tensor output backend to GPU
|
2713
3009
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
2714
|
-
//
|
2715
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
2716
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
2717
3010
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
2718
3011
|
offload_func_t offload_func_kq = llama_nop;
|
2719
3012
|
offload_func_t offload_func_v = llama_nop;
|
@@ -2730,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2730
3023
|
}
|
2731
3024
|
#endif // GGML_USE_CUBLAS
|
2732
3025
|
|
3026
|
+
// KQ_scale
|
2733
3027
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3028
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2734
3029
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
2735
3030
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2736
3031
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
2737
3032
|
}
|
2738
|
-
|
3033
|
+
|
3034
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3035
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3036
|
+
offload_func_kq(KQ_mask);
|
3037
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3038
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3039
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3040
|
+
float * data = (float *) KQ_mask->data;
|
3041
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3042
|
+
|
3043
|
+
for (int h = 0; h < 1; ++h) {
|
3044
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3045
|
+
const llama_pos pos = batch.pos[j];
|
3046
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3047
|
+
|
3048
|
+
for (int i = 0; i < n_kv; ++i) {
|
3049
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3050
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3051
|
+
}
|
3052
|
+
}
|
3053
|
+
}
|
3054
|
+
}
|
3055
|
+
}
|
3056
|
+
|
3057
|
+
// KQ_pos - contains the positions
|
3058
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3059
|
+
offload_func_kq(KQ_pos);
|
3060
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3061
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3062
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3063
|
+
int * data = (int *) KQ_pos->data;
|
3064
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3065
|
+
data[i] = batch.pos[i];
|
3066
|
+
}
|
3067
|
+
}
|
3068
|
+
|
3069
|
+
// shift the entire K-cache if needed
|
3070
|
+
if (do_rope_shift) {
|
3071
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3072
|
+
offload_func_kq(K_shift);
|
3073
|
+
ggml_set_name(K_shift, "K_shift");
|
3074
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3075
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3076
|
+
int * data = (int *) K_shift->data;
|
3077
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3078
|
+
data[i] = kv_self.cells[i].delta;
|
3079
|
+
}
|
3080
|
+
}
|
3081
|
+
|
3082
|
+
for (int il = 0; il < n_layer; ++il) {
|
3083
|
+
struct ggml_tensor * tmp =
|
3084
|
+
ggml_rope_custom_inplace(ctx0,
|
3085
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3086
|
+
n_embd_head, n_head_kv, n_ctx,
|
3087
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3088
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3089
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3090
|
+
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
|
3091
|
+
offload_func_kq(tmp);
|
3092
|
+
ggml_build_forward_expand(gf, tmp);
|
3093
|
+
}
|
3094
|
+
}
|
2739
3095
|
|
2740
3096
|
for (int il = 0; il < n_layer; ++il) {
|
2741
3097
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
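The KQ_mask block added in this builder (and repeated below for the other architectures) encodes, for every query token j, which cache cells i it may attend to: a cell is masked with -INFINITY unless it belongs to the same sequence and its position is not in the future. A small self-contained sketch of that mask construction, with hypothetical cell and token data standing in for kv_self.cells and batch:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Illustrative n_kv x n_tokens attention mask as built in the diff:
    // 0.0f where attention is allowed, -INFINITY where it is not.
    int main() {
        const int n_kv = 4, n_tokens = 2;

        // hypothetical stand-ins for kv_self.cells[i].pos/seq_id and batch.pos/seq_id
        const int cell_pos[n_kv]    = {0, 1, 2, 3};
        const int cell_seq[n_kv]    = {0, 0, 0, 1};
        const int tok_pos[n_tokens] = {2, 3};
        const int tok_seq[n_tokens] = {0, 0};

        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (cell_seq[i] != tok_seq[j] || cell_pos[i] > tok_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;   // other sequence or future position
                }
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                std::printf("%6.1f ", mask[j*n_kv + i]);
            }
            std::printf("\n");
        }
        return 0;
    }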
@@ -2777,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2777
3133
|
struct ggml_tensor * Qcur;
|
2778
3134
|
switch (model.type) {
|
2779
3135
|
case MODEL_7B:
|
2780
|
-
Kcur =
|
2781
|
-
Qcur =
|
3136
|
+
Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
3137
|
+
Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2782
3138
|
break;
|
2783
3139
|
case MODEL_13B:
|
2784
|
-
Kcur
|
2785
|
-
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head,
|
3140
|
+
Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
|
3141
|
+
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
|
2786
3142
|
break;
|
2787
3143
|
default:
|
2788
3144
|
GGML_ASSERT(false);
|
@@ -2796,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2796
3152
|
|
2797
3153
|
// store key and value to memory
|
2798
3154
|
{
|
2799
|
-
// compute the transposed [
|
3155
|
+
// compute the transposed [n_tokens, n_embd] V matrix
|
2800
3156
|
|
2801
3157
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
2802
3158
|
offload_func_v(tmpv);
|
2803
3159
|
ggml_set_name(tmpv, "tmpv");
|
2804
3160
|
|
2805
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
|
3161
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
2806
3162
|
offload_func_v(Vcur);
|
2807
3163
|
ggml_set_name(Vcur, "Vcur");
|
2808
3164
|
|
2809
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3165
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
2810
3166
|
offload_func_kq(k);
|
2811
3167
|
ggml_set_name(k, "k");
|
2812
3168
|
|
2813
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3169
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
2814
3170
|
( n_ctx)*ggml_element_size(kv_self.v),
|
2815
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3171
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
2816
3172
|
offload_func_v(v);
|
2817
3173
|
ggml_set_name(v, "v");
|
2818
3174
|
|
@@ -2827,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2827
3183
|
|
2828
3184
|
struct ggml_tensor * K =
|
2829
3185
|
ggml_view_3d(ctx0, kv_self.k,
|
2830
|
-
n_embd_head,
|
3186
|
+
n_embd_head, n_kv, n_head_kv,
|
2831
3187
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2832
3188
|
ggml_element_size(kv_self.k)*n_embd_head,
|
2833
3189
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -2840,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2840
3196
|
ggml_set_name(KQ, "KQ");
|
2841
3197
|
|
2842
3198
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
2843
|
-
// KQ_scaled shape [n_past +
|
2844
|
-
struct ggml_tensor * KQ_scaled =
|
3199
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3200
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
2845
3201
|
offload_func_kq(KQ_scaled);
|
2846
3202
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
2847
3203
|
|
@@ -2850,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2850
3206
|
|
2851
3207
|
switch (model.type) {
|
2852
3208
|
case MODEL_7B:
|
2853
|
-
KQ_masked =
|
3209
|
+
KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
2854
3210
|
break;
|
2855
3211
|
case MODEL_13B:
|
2856
|
-
|
3212
|
+
// TODO: replace with ggml_add()
|
3213
|
+
KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
|
2857
3214
|
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
2858
|
-
KQ_masked =
|
3215
|
+
KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
2859
3216
|
break;
|
2860
3217
|
default:
|
2861
3218
|
GGML_ASSERT(false);
|
2862
3219
|
}
|
2863
|
-
// KQ_masked = mask_past(KQ_scaled)
|
2864
|
-
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
2865
|
-
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
|
2866
|
-
// offload_func_kq(KQ_masked);
|
2867
|
-
// ggml_set_name(KQ_masked, "KQ_masked");
|
2868
3220
|
|
2869
3221
|
// KQ = soft_max(KQ_masked)
|
2870
|
-
struct ggml_tensor * KQ_soft_max =
|
3222
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
2871
3223
|
offload_func_v(KQ_soft_max);
|
2872
3224
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
2873
3225
|
|
2874
3226
|
// split cached V into n_head heads
|
2875
3227
|
struct ggml_tensor * V =
|
2876
3228
|
ggml_view_3d(ctx0, kv_self.v,
|
2877
|
-
|
3229
|
+
n_kv, n_embd_head, n_head_kv,
|
2878
3230
|
ggml_element_size(kv_self.v)*n_ctx,
|
2879
3231
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
2880
3232
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
2881
3233
|
offload_func_v(V);
|
2882
3234
|
ggml_set_name(V, "V");
|
2883
3235
|
|
2884
|
-
#if 1
|
2885
3236
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
2886
3237
|
offload_func_v(KQV);
|
2887
3238
|
ggml_set_name(KQV, "KQV");
|
2888
|
-
#else
|
2889
|
-
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
2890
|
-
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
2891
|
-
// is there a better way?
|
2892
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
2893
|
-
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
2894
|
-
#endif
|
2895
3239
|
|
2896
3240
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
2897
3241
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
2898
3242
|
offload_func_v(KQV_merged);
|
2899
3243
|
ggml_set_name(KQV_merged, "KQV_merged");
|
2900
3244
|
|
2901
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
2902
|
-
cur =
|
2903
|
-
KQV_merged,
|
2904
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
3245
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3246
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
2905
3247
|
offload_func_v(cur);
|
2906
3248
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
2907
3249
|
|
@@ -2994,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2994
3336
|
|
2995
3337
|
static struct ggml_cgraph * llm_build_falcon(
|
2996
3338
|
llama_context & lctx,
|
2997
|
-
const
|
2998
|
-
const float * embd,
|
2999
|
-
int n_tokens,
|
3000
|
-
int n_past) {
|
3001
|
-
|
3002
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3003
|
-
|
3004
|
-
const int N = n_tokens;
|
3005
|
-
|
3339
|
+
const llama_batch & batch) {
|
3006
3340
|
const auto & model = lctx.model;
|
3007
3341
|
const auto & hparams = model.hparams;
|
3342
|
+
const auto & cparams = lctx.cparams;
|
3008
3343
|
|
3009
3344
|
const auto & kv_self = lctx.kv_self;
|
3010
3345
|
|
@@ -3012,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3012
3347
|
|
3013
3348
|
const int64_t n_embd = hparams.n_embd;
|
3014
3349
|
const int64_t n_layer = hparams.n_layer;
|
3015
|
-
const int64_t n_ctx =
|
3350
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3016
3351
|
const int64_t n_head = hparams.n_head;
|
3017
3352
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3018
3353
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -3020,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3020
3355
|
|
3021
3356
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3022
3357
|
|
3023
|
-
const float freq_base =
|
3024
|
-
const float freq_scale =
|
3358
|
+
const float freq_base = cparams.rope_freq_base;
|
3359
|
+
const float freq_scale = cparams.rope_freq_scale;
|
3025
3360
|
const float norm_eps = hparams.f_norm_eps;
|
3026
3361
|
|
3027
3362
|
const int n_gpu_layers = model.n_gpu_layers;
|
3028
3363
|
|
3364
|
+
const int32_t n_tokens = batch.n_tokens;
|
3365
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3366
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3367
|
+
|
3368
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
3369
|
+
|
3370
|
+
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
|
3371
|
+
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
|
3372
|
+
|
3029
3373
|
auto & buf_compute = lctx.buf_compute;
|
3030
3374
|
|
3031
3375
|
struct ggml_init_params params = {
|
@@ -3043,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3043
3387
|
struct ggml_tensor * cur;
|
3044
3388
|
struct ggml_tensor * inpL;
|
3045
3389
|
|
3046
|
-
if (
|
3047
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3390
|
+
if (batch.token) {
|
3391
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3048
3392
|
|
3049
3393
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3050
3394
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3051
|
-
memcpy(inp_tokens->data,
|
3395
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3052
3396
|
}
|
3053
3397
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3054
3398
|
|
@@ -3058,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3058
3402
|
GGML_ASSERT(false && "not implemented");
|
3059
3403
|
#endif
|
3060
3404
|
|
3061
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
3405
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3062
3406
|
|
3063
3407
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
3064
3408
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3065
|
-
memcpy(inpL->data, embd,
|
3409
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
3066
3410
|
}
|
3067
3411
|
}
|
3068
3412
|
|
@@ -3071,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3071
3415
|
|
3072
3416
|
// offload functions set the tensor output backend to GPU
|
3073
3417
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
3074
|
-
//
|
3075
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
3076
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
3077
3418
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
3078
3419
|
offload_func_t offload_func_kq = llama_nop;
|
3079
3420
|
offload_func_t offload_func_v = llama_nop;
|
@@ -3090,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3090
3431
|
}
|
3091
3432
|
#endif // GGML_USE_CUBLAS
|
3092
3433
|
|
3434
|
+
// KQ_scale
|
3093
3435
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3436
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3094
3437
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3095
3438
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3096
3439
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3097
3440
|
}
|
3098
|
-
|
3441
|
+
|
3442
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3443
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3444
|
+
offload_func_kq(KQ_mask);
|
3445
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3446
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3447
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3448
|
+
float * data = (float *) KQ_mask->data;
|
3449
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3450
|
+
|
3451
|
+
for (int h = 0; h < 1; ++h) {
|
3452
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3453
|
+
const llama_pos pos = batch.pos[j];
|
3454
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3455
|
+
|
3456
|
+
for (int i = 0; i < n_kv; ++i) {
|
3457
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3458
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3459
|
+
}
|
3460
|
+
}
|
3461
|
+
}
|
3462
|
+
}
|
3463
|
+
}
|
3464
|
+
|
3465
|
+
// KQ_pos - contains the positions
|
3466
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3467
|
+
offload_func_kq(KQ_pos);
|
3468
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3469
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3470
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3471
|
+
int * data = (int *) KQ_pos->data;
|
3472
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3473
|
+
data[i] = batch.pos[i];
|
3474
|
+
}
|
3475
|
+
}
|
3476
|
+
|
3477
|
+
// shift the entire K-cache if needed
|
3478
|
+
if (do_rope_shift) {
|
3479
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3480
|
+
offload_func_kq(K_shift);
|
3481
|
+
ggml_set_name(K_shift, "K_shift");
|
3482
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3483
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3484
|
+
int * data = (int *) K_shift->data;
|
3485
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3486
|
+
data[i] = kv_self.cells[i].delta;
|
3487
|
+
}
|
3488
|
+
}
|
3489
|
+
|
3490
|
+
for (int il = 0; il < n_layer; ++il) {
|
3491
|
+
struct ggml_tensor * tmp =
|
3492
|
+
ggml_rope_custom_inplace(ctx0,
|
3493
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3494
|
+
n_embd_head, n_head_kv, n_ctx,
|
3495
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3496
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3497
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3498
|
+
K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
|
3499
|
+
offload_func_kq(tmp);
|
3500
|
+
ggml_build_forward_expand(gf, tmp);
|
3501
|
+
}
|
3502
|
+
}
|
3099
3503
|
|
3100
3504
|
for (int il = 0; il < n_layer; ++il) {
|
3101
3505
|
struct ggml_tensor * attn_norm;
|
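The do_rope_shift branch above re-rotates every cached K row by the per-cell delta stored in kv_self.cells[i].delta, so that after cells are moved the rotary embedding again matches each cell's logical position. The sketch below applies the same idea to a plain float vector: rotating by an extra angle of delta * theta_i is equivalent to having encoded the shifted position in the first place. Sizes and the frequency base are illustrative assumptions, not values from the diff:

    #include <cmath>
    #include <cstdio>

    // Illustrative RoPE-style shift: rotate consecutive pairs (x[2i], x[2i+1])
    // by delta * theta_i, where theta_i = freq_base^(-i / n_dims).
    static void rope_shift(float * x, int n_dims, int delta, float freq_base) {
        for (int i = 0; i < n_dims; i += 2) {
            const float theta = delta * std::pow(freq_base, -(float) i / n_dims);
            const float c = std::cos(theta), s = std::sin(theta);
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        float k_row[4] = {1.0f, 0.0f, 1.0f, 0.0f};  // hypothetical cached K row
        rope_shift(k_row, 4, /*delta=*/-2, /*freq_base=*/10000.0f);
        std::printf("%f %f %f %f\n", k_row[0], k_row[1], k_row[2], k_row[3]);
        return 0;
    }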
@@ -3152,148 +3556,395 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3152
3556
|
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
3153
3557
|
// non-contiguous views is added for the rope operator
|
3154
3558
|
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
3155
|
-
ctx0, cur, n_embd_head, n_head,
|
3559
|
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
3156
3560
|
wsize * n_embd_head,
|
3157
3561
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3158
3562
|
0));
|
3159
3563
|
offload_func_kq(tmpq);
|
3160
3564
|
|
3161
|
-
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
3162
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
3163
|
-
wsize * n_embd_head,
|
3164
|
-
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3165
|
-
wsize * n_embd_head * n_head));
|
3166
|
-
offload_func_kq(tmpk);
|
3565
|
+
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
3566
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3567
|
+
wsize * n_embd_head,
|
3568
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3569
|
+
wsize * n_embd_head * n_head));
|
3570
|
+
offload_func_kq(tmpk);
|
3571
|
+
|
3572
|
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
3573
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3574
|
+
wsize * n_embd_head,
|
3575
|
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3576
|
+
wsize * n_embd_head * (n_head + n_head_kv));
|
3577
|
+
offload_func_v(tmpv);
|
3578
|
+
|
3579
|
+
// using mode = 2 for neox mode
|
3580
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3581
|
+
offload_func_kq(Qcur);
|
3582
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3583
|
+
offload_func_kq(Kcur);
|
3584
|
+
|
3585
|
+
{
|
3586
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3587
|
+
offload_func_v(Vcur);
|
3588
|
+
offload_func_v(Vcur->src[0]->src[0]);
|
3589
|
+
ggml_set_name(Vcur, "Vcur");
|
3590
|
+
|
3591
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3592
|
+
offload_func_kq(k);
|
3593
|
+
ggml_set_name(k, "k");
|
3594
|
+
|
3595
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3596
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
3597
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3598
|
+
offload_func_v(v);
|
3599
|
+
|
3600
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
3601
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
3602
|
+
}
|
3603
|
+
|
3604
|
+
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
3605
|
+
offload_func_kq(Q);
|
3606
|
+
ggml_set_name(Q, "Q");
|
3607
|
+
|
3608
|
+
struct ggml_tensor * K =
|
3609
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3610
|
+
n_embd_head, n_kv, n_head_kv,
|
3611
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3612
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3613
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
3614
|
+
offload_func_kq(K);
|
3615
|
+
ggml_set_name(K, "K");
|
3616
|
+
|
3617
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
3618
|
+
offload_func_kq(KQ);
|
3619
|
+
ggml_set_name(KQ, "KQ");
|
3620
|
+
|
3621
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
3622
|
+
offload_func_kq(KQ_scaled);
|
3623
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3624
|
+
|
3625
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3626
|
+
offload_func_kq(KQ_masked);
|
3627
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
3628
|
+
|
3629
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
3630
|
+
offload_func_v(KQ_soft_max);
|
3631
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
3632
|
+
|
3633
|
+
struct ggml_tensor * V =
|
3634
|
+
ggml_view_3d(ctx0, kv_self.v,
|
3635
|
+
n_kv, n_embd_head, n_head_kv,
|
3636
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
3637
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3638
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
3639
|
+
offload_func_v(V);
|
3640
|
+
ggml_set_name(V, "V");
|
3641
|
+
|
3642
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
3643
|
+
offload_func_v(KQV);
|
3644
|
+
ggml_set_name(KQV, "KQV");
|
3645
|
+
|
3646
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3647
|
+
offload_func_v(KQV_merged);
|
3648
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
3649
|
+
|
3650
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3651
|
+
offload_func_v(cur);
|
3652
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
3653
|
+
|
3654
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
3655
|
+
offload_func(cur);
|
3656
|
+
ggml_set_name(cur, "result_wo");
|
3657
|
+
}
|
3658
|
+
|
3659
|
+
struct ggml_tensor * attn_out = cur;
|
3660
|
+
|
3661
|
+
// feed forward
|
3662
|
+
{
|
3663
|
+
struct ggml_tensor * inpFF = attn_norm;
|
3664
|
+
|
3665
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
|
3666
|
+
offload_func(cur);
|
3667
|
+
|
3668
|
+
cur = ggml_gelu(ctx0, cur);
|
3669
|
+
offload_func(cur);
|
3670
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
3671
|
+
offload_func(cur);
|
3672
|
+
}
|
3673
|
+
|
3674
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
3675
|
+
offload_func(cur);
|
3676
|
+
cur = ggml_add(ctx0, cur, inpL);
|
3677
|
+
offload_func(cur);
|
3678
|
+
|
3679
|
+
// input for next layer
|
3680
|
+
inpL = cur;
|
3681
|
+
}
|
3682
|
+
|
3683
|
+
cur = inpL;
|
3684
|
+
|
3685
|
+
// norm
|
3686
|
+
{
|
3687
|
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
3688
|
+
offload_func_nr(cur);
|
3689
|
+
|
3690
|
+
cur = ggml_add(ctx0,
|
3691
|
+
ggml_mul(ctx0, cur, model.output_norm),
|
3692
|
+
model.output_norm_b);
|
3693
|
+
ggml_set_name(cur, "result_norm");
|
3694
|
+
}
|
3695
|
+
|
3696
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
3697
|
+
ggml_set_name(cur, "result_output");
|
3698
|
+
|
3699
|
+
ggml_build_forward_expand(gf, cur);
|
3700
|
+
|
3701
|
+
ggml_free(ctx0);
|
3702
|
+
|
3703
|
+
return gf;
|
3704
|
+
}
|
3705
|
+
|
3706
|
+
static struct ggml_cgraph * llm_build_starcoder(
|
3707
|
+
llama_context & lctx,
|
3708
|
+
const llama_batch & batch) {
|
3709
|
+
const auto & model = lctx.model;
|
3710
|
+
const auto & hparams = model.hparams;
|
3711
|
+
const auto & cparams = lctx.cparams;
|
3712
|
+
|
3713
|
+
const auto & kv_self = lctx.kv_self;
|
3714
|
+
|
3715
|
+
GGML_ASSERT(!!kv_self.ctx);
|
3716
|
+
|
3717
|
+
const int64_t n_embd = hparams.n_embd;
|
3718
|
+
const int64_t n_layer = hparams.n_layer;
|
3719
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3720
|
+
const int64_t n_head = hparams.n_head;
|
3721
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
3722
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
3723
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
3724
|
+
|
3725
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3726
|
+
|
3727
|
+
const float norm_eps = hparams.f_norm_eps;
|
3728
|
+
|
3729
|
+
const int32_t n_tokens = batch.n_tokens;
|
3730
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3731
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3732
|
+
|
3733
|
+
auto & buf_compute = lctx.buf_compute;
|
3734
|
+
|
3735
|
+
struct ggml_init_params params = {
|
3736
|
+
/*.mem_size =*/ buf_compute.size,
|
3737
|
+
/*.mem_buffer =*/ buf_compute.data,
|
3738
|
+
/*.no_alloc =*/ false,
|
3739
|
+
};
|
3740
|
+
|
3741
|
+
params.no_alloc = true;
|
3742
|
+
|
3743
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
3744
|
+
|
3745
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
3746
|
+
|
3747
|
+
struct ggml_tensor * cur;
|
3748
|
+
struct ggml_tensor * token;
|
3749
|
+
struct ggml_tensor * position;
|
3750
|
+
struct ggml_tensor * inpL;
|
3751
|
+
|
3752
|
+
if (batch.token) {
|
3753
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3754
|
+
|
3755
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3756
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3757
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3758
|
+
}
|
3759
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
3760
|
+
|
3761
|
+
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
3762
|
+
} else {
|
3763
|
+
#ifdef GGML_USE_MPI
|
3764
|
+
GGML_ASSERT(false && "not implemented");
|
3765
|
+
#endif
|
3766
|
+
|
3767
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3768
|
+
|
3769
|
+
ggml_allocr_alloc(lctx.alloc, token);
|
3770
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3771
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
3772
|
+
}
|
3773
|
+
}
|
3774
|
+
|
3775
|
+
{
|
3776
|
+
// Compute position embeddings.
|
3777
|
+
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3778
|
+
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
3779
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3780
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3781
|
+
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
3782
|
+
}
|
3783
|
+
}
|
3784
|
+
ggml_set_name(inp_positions, "inp_positions");
|
3785
|
+
|
3786
|
+
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
3787
|
+
}
|
3788
|
+
|
3789
|
+
// KQ_scale
|
3790
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3791
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3792
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3793
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3794
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3795
|
+
}
|
3796
|
+
|
3797
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3798
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3799
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3800
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3801
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3802
|
+
float * data = (float *) KQ_mask->data;
|
3803
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3804
|
+
|
3805
|
+
for (int h = 0; h < 1; ++h) {
|
3806
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3807
|
+
const llama_pos pos = batch.pos[j];
|
3808
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3809
|
+
|
3810
|
+
for (int i = 0; i < n_kv; ++i) {
|
3811
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3812
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3813
|
+
}
|
3814
|
+
}
|
3815
|
+
}
|
3816
|
+
}
|
3817
|
+
}
|
3818
|
+
|
3819
|
+
inpL = ggml_add(ctx0, token, position);
|
3820
|
+
ggml_set_name(inpL, "inpL");
|
3167
3821
|
|
3168
|
-
|
3169
|
-
|
3170
|
-
|
3171
|
-
|
3172
|
-
|
3173
|
-
|
3822
|
+
for (int il = 0; il < n_layer; ++il) {
|
3823
|
+
{
|
3824
|
+
// Norm
|
3825
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
3826
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
3827
|
+
}
|
3174
3828
|
|
3175
|
-
|
3176
|
-
|
3177
|
-
|
3178
|
-
|
3179
|
-
|
3829
|
+
{
|
3830
|
+
// Self Attention
|
3831
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
3832
|
+
|
3833
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
3834
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
3835
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
3836
|
+
|
3837
|
+
struct ggml_tensor * Qcur = tmpq;
|
3838
|
+
struct ggml_tensor * Kcur = tmpk;
|
3180
3839
|
|
3181
3840
|
{
|
3182
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
3183
|
-
offload_func_v(Vcur);
|
3184
|
-
offload_func_v(Vcur->src[0]->src[0]);
|
3841
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3185
3842
|
ggml_set_name(Vcur, "Vcur");
|
3186
3843
|
|
3187
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3188
|
-
offload_func_kq(k);
|
3844
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3189
3845
|
ggml_set_name(k, "k");
|
3190
3846
|
|
3191
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3847
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3192
3848
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3193
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3194
|
-
offload_func_v(v);
|
3849
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3195
3850
|
|
3196
3851
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
3197
3852
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
3198
3853
|
}
|
3199
3854
|
|
3200
|
-
struct ggml_tensor * Q =
|
3201
|
-
|
3855
|
+
struct ggml_tensor * Q =
|
3856
|
+
ggml_permute(ctx0,
|
3857
|
+
ggml_cpy(ctx0,
|
3858
|
+
Qcur,
|
3859
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
3860
|
+
0, 2, 1, 3);
|
3202
3861
|
ggml_set_name(Q, "Q");
|
3203
3862
|
|
3204
3863
|
struct ggml_tensor * K =
|
3205
3864
|
ggml_view_3d(ctx0, kv_self.k,
|
3206
|
-
n_embd_head,
|
3865
|
+
n_embd_head, n_kv, n_head_kv,
|
3207
3866
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3208
3867
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3209
3868
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
3210
|
-
offload_func_kq(K);
|
3211
3869
|
ggml_set_name(K, "K");
|
3212
3870
|
|
3871
|
+
// K * Q
|
3213
3872
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
3214
|
-
offload_func_kq(KQ);
|
3215
3873
|
ggml_set_name(KQ, "KQ");
|
3216
3874
|
|
3875
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
3876
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3217
3877
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
3218
|
-
offload_func_kq(KQ_scaled);
|
3219
3878
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3220
3879
|
|
3221
|
-
|
3222
|
-
|
3880
|
+
// KQ_masked = mask_past(KQ_scaled)
|
3881
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3223
3882
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3224
3883
|
|
3884
|
+
// KQ = soft_max(KQ_masked)
|
3225
3885
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
3226
|
-
offload_func_v(KQ_soft_max);
|
3227
3886
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
3228
3887
|
|
3888
|
+
// split cached V into n_head heads
|
3229
3889
|
struct ggml_tensor * V =
|
3230
3890
|
ggml_view_3d(ctx0, kv_self.v,
|
3231
|
-
|
3891
|
+
n_kv, n_embd_head, n_head_kv,
|
3232
3892
|
ggml_element_size(kv_self.v)*n_ctx,
|
3233
3893
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3234
3894
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
3235
|
-
offload_func_v(V);
|
3236
3895
|
ggml_set_name(V, "V");
|
3237
3896
|
|
3238
3897
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
3239
|
-
offload_func_v(KQV);
|
3240
3898
|
ggml_set_name(KQV, "KQV");
|
3241
3899
|
|
3900
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
3242
3901
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3243
|
-
offload_func_v(KQV_merged);
|
3244
3902
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3245
3903
|
|
3246
|
-
cur =
|
3247
|
-
|
3904
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3905
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3248
3906
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3249
|
-
|
3250
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
3251
|
-
offload_func(cur);
|
3252
|
-
ggml_set_name(cur, "result_wo");
|
3253
3907
|
}
|
3254
3908
|
|
3255
|
-
|
3909
|
+
// Projection
|
3910
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
3256
3911
|
|
3257
|
-
//
|
3912
|
+
// Add the input
|
3913
|
+
cur = ggml_add(ctx0, cur, inpL);
|
3914
|
+
|
3915
|
+
struct ggml_tensor * inpFF = cur;
|
3916
|
+
|
3917
|
+
// FF
|
3258
3918
|
{
|
3259
|
-
|
3919
|
+
// Norm
|
3920
|
+
{
|
3921
|
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
3922
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
3923
|
+
}
|
3260
3924
|
|
3261
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].w3,
|
3262
|
-
offload_func(cur);
|
3925
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
3263
3926
|
|
3927
|
+
// GELU activation
|
3264
3928
|
cur = ggml_gelu(ctx0, cur);
|
3265
|
-
offload_func(cur);
|
3266
|
-
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
3267
|
-
offload_func(cur);
|
3268
|
-
}
|
3269
3929
|
|
3270
|
-
|
3271
|
-
|
3272
|
-
|
3273
|
-
offload_func(cur);
|
3930
|
+
// Projection
|
3931
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
3932
|
+
}
|
3274
3933
|
|
3275
|
-
|
3276
|
-
inpL = cur;
|
3934
|
+
inpL = ggml_add(ctx0, cur, inpFF);
|
3277
3935
|
}
|
3278
3936
|
|
3279
|
-
|
3280
|
-
|
3281
|
-
// norm
|
3937
|
+
// Output Norm
|
3282
3938
|
{
|
3283
|
-
cur = ggml_norm(ctx0,
|
3284
|
-
|
3285
|
-
|
3286
|
-
cur = ggml_add(ctx0,
|
3287
|
-
ggml_mul(ctx0, cur, model.output_norm),
|
3288
|
-
model.output_norm_b);
|
3289
|
-
ggml_set_name(cur, "result_norm");
|
3939
|
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
3940
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
3290
3941
|
}
|
3942
|
+
ggml_set_name(cur, "result_norm");
|
3291
3943
|
|
3292
3944
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
3293
3945
|
ggml_set_name(cur, "result_output");
|
3294
3946
|
|
3295
3947
|
ggml_build_forward_expand(gf, cur);
|
3296
|
-
|
3297
3948
|
ggml_free(ctx0);
|
3298
3949
|
|
3299
3950
|
return gf;
|
@@ -3301,10 +3952,7 @@ static struct ggml_cgraph * llm_build_falcon(

 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
-     const
-     const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
+     const llama_batch & batch) {
     const auto & model = lctx.model;

     struct ggml_cgraph * result = NULL;
@@ -3312,72 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
|
|
3312
3960
|
switch (model.arch) {
|
3313
3961
|
case LLM_ARCH_LLAMA:
|
3314
3962
|
{
|
3315
|
-
result = llm_build_llama(lctx,
|
3963
|
+
result = llm_build_llama(lctx, batch);
|
3316
3964
|
} break;
|
3317
3965
|
case LLM_ARCH_BAICHUAN:
|
3318
3966
|
{
|
3319
|
-
result = llm_build_baichaun(lctx,
|
3967
|
+
result = llm_build_baichaun(lctx, batch);
|
3320
3968
|
} break;
|
3321
3969
|
case LLM_ARCH_FALCON:
|
3322
3970
|
{
|
3323
|
-
result = llm_build_falcon(lctx,
|
3971
|
+
result = llm_build_falcon(lctx, batch);
|
3972
|
+
} break;
|
3973
|
+
case LLM_ARCH_STARCODER:
|
3974
|
+
{
|
3975
|
+
result = llm_build_starcoder(lctx, batch);
|
3324
3976
|
} break;
|
3325
3977
|
default:
|
3326
3978
|
GGML_ASSERT(false);
|
3327
|
-
}
|
3979
|
+
}
|
3328
3980
|
|
3329
3981
|
return result;
|
3330
3982
|
}
|
3331
3983
|
|
3332
|
-
//
|
3984
|
+
// decode a batch of tokens by evaluating the transformer
|
3333
3985
|
//
|
3334
3986
|
// - lctx: llama context
|
3335
|
-
// -
|
3336
|
-
// - embd embeddings input
|
3337
|
-
// - n_tokens number of tokens
|
3338
|
-
// - n_past: the context size so far
|
3987
|
+
// - batch: batch to evaluate
|
3339
3988
|
// - n_threads: number of threads to use
|
3340
3989
|
//
|
3341
|
-
|
3990
|
+
// return 0 on success
|
3991
|
+
// return positive int on warning
|
3992
|
+
// return negative int on error
|
3993
|
+
//
|
3994
|
+
static int llama_decode_internal(
|
3342
3995
|
llama_context & lctx,
|
3343
|
-
|
3344
|
-
|
3345
|
-
|
3346
|
-
|
3347
|
-
|
3348
|
-
|
3996
|
+
llama_batch batch) {
|
3997
|
+
const uint32_t n_tokens = batch.n_tokens;
|
3998
|
+
|
3999
|
+
if (n_tokens == 0) {
|
4000
|
+
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
|
4001
|
+
return -1;
|
4002
|
+
}
|
4003
|
+
|
4004
|
+
const auto & model = lctx.model;
|
4005
|
+
const auto & hparams = model.hparams;
|
4006
|
+
const auto & cparams = lctx.cparams;
|
3349
4007
|
|
3350
|
-
|
4008
|
+
const auto n_batch = cparams.n_batch;
|
3351
4009
|
|
3352
|
-
GGML_ASSERT(n_tokens
|
3353
|
-
|
3354
|
-
|
3355
|
-
|
3356
|
-
// GGML_ASSERT(n_past + n_tokens <= n_ctx);
|
4010
|
+
GGML_ASSERT(n_tokens <= n_batch);
|
4011
|
+
|
4012
|
+
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
4013
|
+
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
3357
4014
|
|
3358
4015
|
const int64_t t_start_us = ggml_time_us();
|
3359
4016
|
|
3360
4017
|
#ifdef GGML_USE_MPI
|
3361
|
-
|
4018
|
+
// TODO: needs fix after #3228
|
4019
|
+
GGML_ASSERT(false && "not implemented");
|
4020
|
+
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
3362
4021
|
#endif
|
3363
4022
|
|
3364
4023
|
GGML_ASSERT(n_threads > 0);
|
3365
4024
|
|
3366
|
-
|
3367
|
-
|
3368
|
-
const auto & model = lctx.model;
|
3369
|
-
const auto & hparams = model.hparams;
|
3370
|
-
|
3371
|
-
const auto & kv_self = lctx.kv_self;
|
4025
|
+
auto & kv_self = lctx.kv_self;
|
3372
4026
|
|
3373
4027
|
GGML_ASSERT(!!kv_self.ctx);
|
3374
4028
|
|
3375
4029
|
const int64_t n_embd = hparams.n_embd;
|
3376
4030
|
const int64_t n_vocab = hparams.n_vocab;
|
3377
4031
|
|
4032
|
+
// helpers for smoother batch API transistion
|
4033
|
+
// after deprecating the llama_eval calls, these will be removed
|
4034
|
+
std::vector<llama_pos> pos;
|
4035
|
+
std::vector<llama_seq_id> seq_id;
|
4036
|
+
|
4037
|
+
if (batch.pos == nullptr) {
|
4038
|
+
pos.resize(n_tokens);
|
4039
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4040
|
+
pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
|
4041
|
+
}
|
4042
|
+
|
4043
|
+
batch.pos = pos.data();
|
4044
|
+
}
|
4045
|
+
|
4046
|
+
if (batch.seq_id == nullptr) {
|
4047
|
+
seq_id.resize(n_tokens);
|
4048
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4049
|
+
seq_id[i] = batch.all_seq_id;
|
4050
|
+
}
|
4051
|
+
|
4052
|
+
batch.seq_id = seq_id.data();
|
4053
|
+
}
|
4054
|
+
|
4055
|
+
// we always start to search for a free slot from the start of the cache
|
4056
|
+
// TODO: better strategies can be implemented
|
4057
|
+
kv_self.head = 0;
|
4058
|
+
|
4059
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
4060
|
+
return 1;
|
4061
|
+
}
|
4062
|
+
|
4063
|
+
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
4064
|
+
// after enough generations, the benefit from this heuristic disappears
|
4065
|
+
// if we start defragmenting the cache, the benefit from this will be more important
|
4066
|
+
//kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
|
4067
|
+
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
|
4068
|
+
|
4069
|
+
//printf("kv_self.n = %d\n", kv_self.n);
|
4070
|
+
|
3378
4071
|
ggml_allocr_reset(lctx.alloc);
|
3379
4072
|
|
3380
|
-
ggml_cgraph * gf = llama_build_graph(lctx,
|
4073
|
+
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
3381
4074
|
|
3382
4075
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
3383
4076
|
|
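llama_decode_internal above fills in positions and sequence ids when the caller passes a bare batch (the llama_batch_get_one path): positions default to all_pos_0 + i*all_pos_1 and every token gets all_seq_id. A simplified stand-alone mirror of that defaulting logic, with a hypothetical MiniBatch struct in place of llama_batch:

    #include <cstdio>
    #include <vector>

    // Hypothetical, simplified mirror of the batch-defaulting helpers shown above.
    struct MiniBatch {
        int        n_tokens   = 0;
        const int *pos        = nullptr;  // optional per-token positions
        const int *seq_id     = nullptr;  // optional per-token sequence ids
        int        all_pos_0  = 0;        // fallback: first position
        int        all_pos_1  = 1;        // fallback: position stride
        int        all_seq_id = 0;        // fallback: sequence id
    };

    int main() {
        MiniBatch batch;
        batch.n_tokens  = 4;
        batch.all_pos_0 = 10;             // e.g. continuing after 10 past tokens

        std::vector<int> pos, seq_id;
        if (batch.pos == nullptr) {
            pos.resize(batch.n_tokens);
            for (int i = 0; i < batch.n_tokens; ++i) {
                pos[i] = batch.all_pos_0 + i * batch.all_pos_1;
            }
            batch.pos = pos.data();
        }
        if (batch.seq_id == nullptr) {
            seq_id.assign(batch.n_tokens, batch.all_seq_id);
            batch.seq_id = seq_id.data();
        }
        for (int i = 0; i < batch.n_tokens; ++i) {
            std::printf("token %d -> pos %d, seq %d\n", i, batch.pos[i], batch.seq_id[i]);
        }
        return 0;
    }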
@@ -3386,6 +4079,7 @@ static bool llama_eval_internal(
|
|
3386
4079
|
ggml_tensor * node = gf->leafs[i];
|
3387
4080
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
3388
4081
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
4082
|
+
ggml_cuda_copy_to_device(node);
|
3389
4083
|
}
|
3390
4084
|
}
|
3391
4085
|
|
@@ -3395,6 +4089,8 @@ static bool llama_eval_internal(
|
|
3395
4089
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
3396
4090
|
}
|
3397
4091
|
}
|
4092
|
+
|
4093
|
+
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
|
3398
4094
|
#endif
|
3399
4095
|
|
3400
4096
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -3404,10 +4100,19 @@ static bool llama_eval_internal(
|
|
3404
4100
|
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
3405
4101
|
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
3406
4102
|
// with the BLAS calls. need a better solution
|
3407
|
-
if (
|
4103
|
+
if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
3408
4104
|
n_threads = std::min(4, n_threads);
|
3409
4105
|
}
|
3410
4106
|
|
4107
|
+
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
4108
|
+
const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
|
4109
|
+
model.arch == LLM_ARCH_BAICHUAN ||
|
4110
|
+
model.arch == LLM_ARCH_FALCON;
|
4111
|
+
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
4112
|
+
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
4113
|
+
n_threads = 1;
|
4114
|
+
}
|
4115
|
+
|
3411
4116
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
3412
4117
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
3413
4118
|
|
@@ -3423,10 +4128,6 @@ static bool llama_eval_internal(
|
|
3423
4128
|
if (lctx.ctx_metal) {
|
3424
4129
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
3425
4130
|
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
3426
|
-
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
3427
|
-
if (!lctx.embedding.empty()) {
|
3428
|
-
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
|
3429
|
-
}
|
3430
4131
|
} else {
|
3431
4132
|
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
3432
4133
|
}
|
@@ -3438,12 +4139,9 @@ static bool llama_eval_internal(
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif

-    // update kv
-    lctx.kv_self.
-
-    if (cgraph_fname) {
-        ggml_graph_export(gf, cgraph_fname);
-    }
+    // update the kv ring buffer
+    lctx.kv_self.head += n_tokens;
+    lctx.kv_self.has_shift = false;

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -3460,13 +4158,20 @@ static bool llama_eval_internal(
     {
         auto & logits_out = lctx.logits;

-        if (
-            logits_out.resize(n_vocab *
-
+        if (batch.logits) {
+            logits_out.resize(n_vocab * n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                if (batch.logits[i] == 0) {
+                    continue;
+                }
+                memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
+            }
+        } else if (lctx.logits_all) {
+            logits_out.resize(n_vocab * n_tokens);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
         } else {
-            // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
         }
     }

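With the new batch API, logits are only meaningful for the rows whose batch.logits flag is set (or the last row in the legacy path). A short sketch of consuming such a flat [n_tokens x n_vocab] logits buffer on the caller side, with hypothetical sizes and data:

    #include <cstdio>
    #include <vector>

    // Illustrative consumer of per-token logits gated by a logits[] flag array,
    // mirroring the copy loop added in the diff. All values are made up.
    int main() {
        const int n_tokens = 3, n_vocab = 5;
        std::vector<float> logits(n_tokens * n_vocab, 0.5f);   // pretend model output
        const unsigned char want_logits[n_tokens] = {0, 0, 1}; // only the last row

        for (int i = 0; i < n_tokens; ++i) {
            if (!want_logits[i]) {
                continue;                        // this row was never copied out
            }
            const float * row = logits.data() + (size_t) n_vocab * i;
            std::printf("token %d: first logit = %f\n", i, row[0]);
        }
        return 0;
    }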
@@ -3475,20 +4180,27 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;

         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(
+        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
     }

     // measure the performance only for the single-token evals
-    if (
+    if (n_tokens == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
-    else if (
+    else if (n_tokens > 1) {
         lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval +=
+        lctx.n_p_eval += n_tokens;
     }

-
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!lctx.has_evaluated_once) {
+        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+        lctx.has_evaluated_once = true;
+    }
+
+    return 0;
 }

 //
@@ -3909,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
3909
4621
|
llm_tokenizer_bpe tokenizer(vocab);
|
3910
4622
|
tokenizer.tokenize(raw_text, output);
|
3911
4623
|
} break;
|
3912
|
-
}
|
4624
|
+
}
|
3913
4625
|
|
3914
4626
|
return output;
|
3915
4627
|
}
|
@@ -3939,7 +4651,7 @@ struct llama_grammar_candidate {
|
|
3939
4651
|
|
3940
4652
|
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
3941
4653
|
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
3942
|
-
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
4654
|
+
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
3943
4655
|
const char * src,
|
3944
4656
|
llama_partial_utf8 partial_start) {
|
3945
4657
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
@@ -4313,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
 // sampling
 //

+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
     GGML_ASSERT(candidates->size > 0);

@@ -4521,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     }
 }

-void
+void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();

     for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4533,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     }
 }

+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    llama_sample_temp(ctx, candidates_p, temp);
+}
+
 void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
     if (last_tokens_size == 0 || penalty == 1.0f) {
         return;
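llama_sample_temp (with llama_sample_temperature kept as an alias) divides every candidate logit by the temperature before sampling: values below 1 sharpen the distribution, values above 1 flatten it. A minimal numeric sketch of that scaling followed by a softmax, independent of the llama.cpp types; the llama.cpp function itself only performs the division, the softmax happens in a later sampling step:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Illustrative temperature scaling plus softmax over a few made-up logits.
    int main() {
        std::vector<float> logits = {2.0f, 1.0f, 0.0f};
        const float temp = 0.5f;                  // hypothetical temperature

        for (float & l : logits) {
            l /= temp;                            // same operation as llama_sample_temp
        }

        float max_l = logits[0];
        for (float l : logits) max_l = std::max(max_l, l);

        std::vector<float> p(logits.size());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) {
            p[i] = std::exp(logits[i] - max_l);
            sum += p[i];
        }
        for (size_t i = 0; i < p.size(); ++i) {
            std::printf("p[%zu] = %f\n", i, p[i] / sum);
        }
        return 0;
    }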
@@ -4656,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
|
|
4656
5379
|
|
4657
5380
|
GGML_ASSERT(ctx);
|
4658
5381
|
|
4659
|
-
auto n_vocab = llama_n_vocab(ctx);
|
5382
|
+
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
4660
5383
|
|
4661
5384
|
GGML_ASSERT(n_vocab == (int)candidates->size);
|
4662
5385
|
GGML_ASSERT(!candidates->sorted);
|
@@ -4685,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
|
|
4685
5408
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
|
4686
5409
|
GGML_ASSERT(ctx);
|
4687
5410
|
|
4688
|
-
auto N = float(llama_n_vocab(ctx));
|
5411
|
+
auto N = float(llama_n_vocab(llama_get_model(ctx)));
|
4689
5412
|
int64_t t_start_sample_us;
|
4690
5413
|
t_start_sample_us = ggml_time_us();
|
4691
5414
|
|
@@ -4872,7 +5595,7 @@ struct llama_logit_info {
|
|
4872
5595
|
};
|
4873
5596
|
llama_logit_info(llama_context * ctx)
|
4874
5597
|
: logits(llama_get_logits(ctx))
|
4875
|
-
, n_vocab(llama_n_vocab(ctx))
|
5598
|
+
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
4876
5599
|
, max_l(*std::max_element(logits, logits + n_vocab))
|
4877
5600
|
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
4878
5601
|
{ }
|
@@ -4910,7 +5633,6 @@ struct llama_beam_search_data {
     size_t n_beams;
     int n_past;
     int n_predict;
-    int n_threads;
     std::vector<llama_beam> beams;
     std::vector<llama_beam> next_beams;
 
@@ -4920,12 +5642,11 @@ struct llama_beam_search_data {
     // Used to communicate to/from callback on beams state.
     std::vector<llama_beam_view> beam_views;
 
-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
       : ctx(ctx)
       , n_beams(n_beams)
       , n_past(n_past)
       , n_predict(n_predict)
-      , n_threads(n_threads)
       , beam_views(n_beams) {
         beams.reserve(n_beams);
         next_beams.reserve(n_beams);
@@ -4962,7 +5683,7 @@ struct llama_beam_search_data {
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
             if (!beam.tokens.empty()) {
-                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
             }
             llama_logit_info logit_info(ctx);
             std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5036,7 +5757,7 @@ struct llama_beam_search_data {
         callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
         update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
         if (common_prefix_length) {
-            llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+            llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
             n_past += common_prefix_length;
         }
         // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5077,11 +5798,11 @@ struct llama_beam_search_data {
 
 void llama_beam_search(llama_context * ctx,
                        llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+                       size_t n_beams, int n_past, int n_predict) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
 
     beam_search_data.loop(callback, callback_data);
 
@@ -5301,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
 
     llama_model model;
-    llm_load_arch   (*ml, model);
-    llm_load_hparams(*ml, model, 0, 0, 0);
+    llm_load_arch(ml, model);
+    llm_load_hparams(ml, model);
 
     if (params->only_copy) {
         ftype = model.ftype;
@@ -5315,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml->ctx_gguf);
+    gguf_set_kv     (ctx_out, ml.ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
@@ -5323,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
 
-    for (int i = 0; i < ml->n_tensors; ++i) {
-        struct ggml_tensor * meta = ml->get_tensor_meta(i);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -5360,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml->n_tensors; ++i) {
-        struct ggml_tensor * meta = ml->get_tensor_meta(i);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -5374,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (int i = 0; i < ml->n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml->get_tensor_meta(i);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(tensor);
 
@@ -5383,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             read_data.resize(ggml_nbytes(tensor));
         }
         tensor->data = read_data.data();
-        ml->load_data_for(tensor);
+        ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, ml->n_tensors,
+               ++idx, ml.n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));
@@ -5536,8 +6257,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
-
-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
@@ -5565,7 +6287,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
@@ -5781,9 +6503,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             ggml_set_name(r, "r_cpy");
         }
 
-        struct ggml_cgraph gf = ggml_build_forward(r);
+        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_build_forward_expand(gf, r);
 
-        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(work_buffer, gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -5812,27 +6535,16 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 //
 // interface implementation
 //
-
-struct llama_context_params llama_context_default_params() {
-    struct llama_context_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
-        /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
-        /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
-        /*.embedding                   =*/ false,
     };
 
 #ifdef GGML_USE_METAL
@@ -5842,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_context_params llama_context_default_params() {
+    struct llama_context_params result = {
+        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
+        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
+        /*.mul_mat_q                   =*/ true,
+        /*.f16_kv                      =*/ true,
+        /*.logits_all                  =*/ false,
+        /*.embedding                   =*/ false,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
@@ -5897,13 +6627,11 @@ int64_t llama_time_us(void) {
 
 struct llama_model * llama_load_model_from_file(
                              const char * path_model,
-            struct llama_context_params   params) {
+              struct llama_model_params   params) {
     ggml_time_init();
 
     llama_model * model = new llama_model;
 
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -5920,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
         };
     }
 
-    if (!llama_model_load(path_model, *model, params.
-                params.main_gpu, params.tensor_split,
-                params.
+    if (!llama_model_load(path_model, *model, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split,
+                params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
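Usage sketch for the hunks above: loading parameters and runtime parameters are now two separate structs with their own default constructors. A minimal call sequence, assuming only the signatures visible in this diff plus the model-freeing call from the public header; the file name and numeric values are placeholders:

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 32;                        // GPU offload now lives in the model params

    llama_context_params cparams = llama_context_default_params();
    cparams.n_threads       = 8;                      // generation threads
    cparams.n_threads_batch = 8;                      // prompt/batch threads

    llama_model   * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    llama_context * ctx   = llama_new_context_with_model(model, cparams);

    // ... run inference with ctx ...

    llama_free(ctx);
    llama_free_model(model);   // assumed from the public llama.h of this release, not shown in this hunk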
@@ -5946,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model);
 
+    const auto & hparams = model->hparams;
+    auto       & cparams = ctx->cparams;
+
+    cparams.n_batch         = params.n_batch;
+    cparams.n_ctx           = params.n_ctx           == 0 ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base  = params.rope_freq_base  == 0 ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
+    LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
+
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     // reserve memory for context buffers
-    if (!params.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, params.n_ctx, params.n_gpu_layers)) {
+    if (!hparams.vocab_only) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -5968,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
-    const auto & hparams = ctx->model.hparams;
-
     // resized during inference
     if (params.logits_all) {
-        ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+        ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
     } else {
         ctx->logits.reserve(hparams.n_vocab);
     }
@@ -5990,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
 
         // build worst-case graph
-        int n_tokens = std::min((int) hparams.n_ctx, params.n_batch);
-        int n_past   = hparams.n_ctx - n_tokens;
+        int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
+        int n_past   = cparams.n_ctx - n_tokens;
         llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+        ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+
 #ifdef GGML_USE_METAL
-        if (params.n_gpu_layers > 0) {
+        if (model->n_gpu_layers > 0) {
             ctx->ctx_metal = ggml_metal_init(1);
             if (!ctx->ctx_metal) {
                 LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
                 llama_free(ctx);
                 return NULL;
             }
-            ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+            //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
         // measure memory requirements for the graph
         size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-        LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
         // recreate allocator with exact memory requirements
         ggml_allocr_free(ctx->alloc);
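As the first hunk above shows, context parameters of 0 now mean "inherit from the model": n_ctx falls back to hparams.n_ctx_train and the rope frequencies fall back to the values trained into the GGUF metadata. Illustrative settings only:

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx          = 0;     // 0 => use the model's n_ctx_train
    cparams.rope_freq_base = 0.0f;  // 0 => use rope_freq_base_train from the model
    cparams.rope_freq_base = 1000000.0f;  // any non-zero value overrides the trained setting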
@@ -6018,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
 #ifdef GGML_USE_METAL
         if (ctx->ctx_metal) {
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
 #ifdef GGML_USE_CUBLAS
-        if (params.low_vram) {
-            LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
-            ggml_cuda_set_scratch_size(0); // disable scratch
-        } else {
-            ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+        ggml_cuda_set_scratch_size(alloc_size);
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+        // calculate total VRAM usage
+        auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+            if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+                size += ggml_nbytes(t);
+            }
+        };
+        size_t model_vram_size = 0;
+        for (const auto & kv : model->tensors_by_name) {
+            add_tensor(kv.second, model_vram_size);
         }
+
+        size_t kv_vram_size = 0;
+        add_tensor(ctx->kv_self.k, kv_vram_size);
+        add_tensor(ctx->kv_self.v, kv_vram_size);
+
+        size_t ctx_vram_size   = alloc_size + kv_vram_size;
+        size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+                total_vram_size / 1024.0 / 1024.0,
+                model_vram_size / 1024.0 / 1024.0,
+                ctx_vram_size   / 1024.0 / 1024.0);
 #endif
     }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
+    if (model->n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
 
-        if (params.use_mmap) {
+        if (ctx->model.mapping) {
             data_ptr  = ctx->model.mapping->addr;
             data_size = ctx->model.mapping->size;
         } else {
@@ -6058,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
             return NULL; \
         }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
-
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",  data_ptr, data_size, max_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",    ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
@@ -6074,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(
 
     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        // TODO: needs fix after #3228
+        GGML_ASSERT(false && "not implemented");
+        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
         llama_backend_free();
         exit(1);
     }
@@ -6084,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params) {
-    struct llama_model * model = llama_load_model_from_file(path_model, params);
-    if (!model) {
-        return nullptr;
-    }
-
-    struct llama_context * ctx = llama_new_context_with_model(model, params);
-    ctx->model_owner = true;
-
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
-int llama_n_vocab(const struct llama_context * ctx) {
-    return llama_model_n_vocab(&ctx->model);
+const llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
-    return llama_model_n_ctx(&ctx->model);
-}
-
-int llama_n_ctx_train(const struct llama_context * ctx) {
-    return llama_model_n_ctx_train(&ctx->model);
-}
-
-int llama_n_embd(const struct llama_context * ctx) {
-    return llama_model_n_embd(&ctx->model);
+    return ctx->cparams.n_ctx;
 }
 
-enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
-    return ctx->model.vocab.type;
+enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+    return model->vocab.type;
 }
 
-int llama_model_n_vocab(const struct llama_model * model) {
+int llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
 
-int llama_model_n_ctx(const struct llama_model * model) {
-    return model->hparams.n_ctx;
-}
-
-int llama_model_n_ctx_train(const struct llama_model * model) {
+int llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }
 
-int llama_model_n_embd(const struct llama_model * model) {
+int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            model->name.c_str(),
+            llama_model_arch_name(model->arch).c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
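The hunk above moves the metadata accessors from the context to the model: llama_n_vocab, llama_n_ctx_train and llama_n_embd now take a llama_model, with llama_get_model(ctx) as the bridge, while llama_n_ctx stays a per-context property. A small sketch using only the functions defined above:

    const llama_model * model = llama_get_model(ctx);

    const int n_vocab = llama_n_vocab(model);      // previously llama_n_vocab(ctx)
    const int n_train = llama_n_ctx_train(model);
    const int n_embd  = llama_n_embd(model);
    const int n_ctx   = llama_n_ctx(ctx);          // now reads the context's cparams.n_ctx

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));   // e.g. "<arch> <type> <ftype>"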
@@ -6161,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
|
|
6174
6912
|
}
|
6175
6913
|
}
|
6176
6914
|
|
6177
|
-
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
6915
|
+
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
|
6178
6916
|
try {
|
6179
|
-
return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
|
6917
|
+
return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
|
6180
6918
|
} catch (const std::exception & err) {
|
6181
6919
|
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
6182
6920
|
return 1;
|
6183
6921
|
}
|
6184
6922
|
}
|
6185
6923
|
|
6186
|
-
int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
|
6924
|
+
int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
|
6187
6925
|
try {
|
6188
|
-
return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
|
6926
|
+
return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
6189
6927
|
} catch (const std::exception & err) {
|
6190
6928
|
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
6191
6929
|
return 1;
|
@@ -6193,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
|
|
6193
6931
|
}
|
6194
6932
|
|
6195
6933
|
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
6196
|
-
return ctx->kv_self.
|
6934
|
+
return ctx->kv_self.head;
|
6197
6935
|
}
|
6198
6936
|
|
6199
|
-
|
6937
|
+
void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
|
6938
|
+
llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
|
6939
|
+
}
|
6200
6940
|
|
6201
|
-
void
|
6202
|
-
|
6203
|
-
|
6204
|
-
|
6205
|
-
|
6941
|
+
void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
6942
|
+
llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
|
6943
|
+
}
|
6944
|
+
|
6945
|
+
void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
|
6946
|
+
llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
|
6947
|
+
}
|
6948
|
+
|
6949
|
+
void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
|
6950
|
+
llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
|
6951
|
+
}
|
6952
|
+
|
6953
|
+
void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
6954
|
+
llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
|
6206
6955
|
}
|
6207
6956
|
|
6208
6957
|
// Returns the *maximum* size of the state
|
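The new llama_kv_cache_* wrappers above expose per-sequence cache editing through the public API; they simply forward to the internal cache. An illustrative sketch, assuming sequences 0 and 1 were populated by earlier llama_decode calls; the position ranges follow the half-open [p0, p1) convention documented in llama.h:

    llama_kv_cache_seq_rm   (ctx, /*seq_id=*/ 0, /*p0=*/ 32, /*p1=*/ 64);  // drop positions [32, 64) of sequence 0
    llama_kv_cache_seq_cp   (ctx, 0, 1, 0, 32);                            // let sequence 1 reuse the first 32 positions
    llama_kv_cache_seq_keep (ctx, 1);                                      // keep only sequence 1 in the cache
    llama_kv_cache_seq_shift(ctx, 1, 32, 64, -32);                         // shift positions [32, 64) left by 32
    llama_kv_cache_tokens_rm(ctx, 32, -1);                                 // drop cached cells from 32 on, as the reworked llama_eval below does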
@@ -6289,7 +7038,17 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+    // TODO: does not support multi-sequence states
+    {
+        const auto & kv_self = ctx->kv_self;
+        for (uint32_t i = 0; i < kv_self.head; ++i) {
+            GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
+            GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
+            GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
+        }
+    }
+
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6340,12 +7099,14 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd_gqa();
-        const int    n_ctx   = hparams.n_ctx;
+        const int    n_ctx   = cparams.n_ctx;
 
         const size_t kv_size = kv_self.buf.size;
-        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+        const int    kv_ntok = kv_self.head;
 
         data_ctx->write(&kv_size, sizeof(kv_size));
         data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6448,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd_gqa();
-        const int    n_ctx   = hparams.n_ctx;
+        const int    n_ctx   = cparams.n_ctx;
 
         size_t kv_size;
         int    kv_ntok;
@@ -6489,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_free(cpy_ctx);
        }
 
-        ctx->kv_self.n    = kv_ntok;
+        ctx->kv_self.head = kv_ntok;
+        ctx->kv_self.size = kv_size;
     }
 
     const size_t nread    = inp - src;
@@ -6584,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 
 int llama_eval(
         struct llama_context * ctx,
-           const llama_token * tokens,
-                         int   n_tokens,
-                         int   n_past,
-                         int   n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+                 llama_token * tokens,
+                     int32_t   n_tokens,
+                         int   n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
-    // get a more accurate load time, upon first eval
-    // TODO: fix this
-    if (!ctx->has_evaluated_once) {
-        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-        ctx->has_evaluated_once = true;
+    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
-    return 0;
+    return ret;
 }
 
 int llama_eval_embd(
            struct llama_context * ctx,
-                    const float * embd,
-                            int   n_tokens,
-                            int   n_past,
-                            int   n_threads) {
-    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+                          float * embd,
+                        int32_t   n_tokens,
+                            int   n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
-    // get a more accurate load time, upon first eval
-    // TODO: fix this
-    if (!ctx->has_evaluated_once) {
-        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-        ctx->has_evaluated_once = true;
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
-    return 0;
+    return ret;
 }
 
-int llama_eval_export(struct llama_context * ctx, const char * fname) {
-    const int n_batch = 1;
-    const int n_ctx   = 512 - n_batch;
+void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+    ctx->cparams.n_threads       = n_threads;
+    ctx->cparams.n_threads_batch = n_threads_batch;
+}
+
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                 int32_t   n_tokens,
+               llama_pos   pos_0,
+            llama_seq_id   seq_id) {
+    return {
+        /*n_tokens   =*/ n_tokens,
+        /*tokens     =*/ tokens,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*all_pos_0  =*/ pos_0,
+        /*all_pos_1  =*/ 1,
+        /*all_seq_id =*/ seq_id,
+    };
+}
 
-    const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+    llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
-    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
+    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }
 
-    return 0;
+    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
+    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+
+    return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+    if (batch.token)  free(batch.token);
+    if (batch.embd)   free(batch.embd);
+    if (batch.pos)    free(batch.pos);
+    if (batch.seq_id) free(batch.seq_id);
+    if (batch.logits) free(batch.logits);
+}
+
+int llama_decode(
+        struct llama_context * ctx,
+          struct llama_batch   batch) {
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+    }
+
+    return ret;
 }
 
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
 
+float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+}
+
 float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
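In the hunk above, llama_eval and llama_eval_embd become thin wrappers over the new llama_decode, llama_eval_export is dropped, and llama_batch plus llama_get_logits_ith are introduced. A minimal, hedged sketch of the replacement loop using only functions visible in this hunk; the one-token prompt is a placeholder for real tokenized input:

    // assume `ctx` was created as in the earlier sketch
    std::vector<llama_token> prompt = { llama_token_bos(ctx) };   // placeholder prompt

    llama_batch batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size(),
                                            /*pos_0=*/ 0, /*seq_id=*/ 0);
    if (llama_decode(ctx, batch) < 0) {
        // negative return values signal a hard decode failure, as in llama_decode above
    }

    float * logits  = llama_get_logits_ith(ctx, (int32_t) prompt.size() - 1);  // logits of the last position
    int     n_vocab = llama_n_vocab(llama_get_model(ctx));
    // choose the next token from logits[0 .. n_vocab) with the llama_sample_* helpers shown earlier

For multi-sequence decoding, llama_batch_init/llama_batch_free allocate a batch whose pos, seq_id and logits arrays are filled in by the caller before each llama_decode call.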
@@ -6671,21 +7473,13 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 }
 
 int llama_tokenize(
-        struct llama_context * ctx,
-                  const char * text,
-                 llama_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
-}
-
-int llama_tokenize_with_model(
     const struct llama_model * model,
                   const char * text,
+                         int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -6699,13 +7493,9 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
-}
-
 // does not write null-terminator to buf
-int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
-    if (0 <= token && token < llama_model_n_vocab(model)) {
+int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+    if (0 <= token && token < llama_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
@@ -6725,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
             buf[2] = '\x85';
             return 3;
         } else if (llama_is_control_token(model->vocab, token)) {
-
+            // do nothing
         } else if (llama_is_byte_token(model->vocab, token)) {
             if (length < 1) {
                 return -1;
@@ -6827,16 +7617,18 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }
 
-void llama_log_set(llama_log_callback log_callback, void * user_data) {
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 }
 
-static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
@@ -6853,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
     va_end(args_copy);
 }
 
-static void llama_log_internal(llama_log_level level, const char * format, ...) {
+static void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
     llama_log_internal_v(level, format, args);
     va_end(args);
 }
 
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
     fputs(text, stderr);
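Logging now reuses ggml's plumbing (ggml_log_level / ggml_log_callback) instead of llama-specific types, and the Metal backend is wired to the same default callback in llama_new_context_with_model above. A hedged sketch of installing a custom callback; the function name is a placeholder:

    static void my_log(ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr);   // route llama.cpp (and, on Metal, ggml-metal) messages wherever needed
    }

    // during start-up:
    llama_log_set(my_log, /*user_data=*/ NULL);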