llama_cpp 0.5.2 → 0.6.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +14 -8
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +307 -127
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +200 -94
- data/ext/llama_cpp/src/ggml-metal.metal +264 -82
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +1647 -865
- data/ext/llama_cpp/src/ggml.h +143 -52
- data/ext/llama_cpp/src/llama.cpp +1427 -635
- data/ext/llama_cpp/src/llama.h +308 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,3 +1,4 @@
+#define LLAMA_API_INTERNAL
 #include "llama.h"
 
 #include "ggml.h"
@@ -71,6 +72,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -91,12 +93,12 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (llama_log_level level, const char* format, ...);
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 //
 // helpers
@@ -108,7 +110,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }
 
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,17 +162,19 @@ enum llm_arch {
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,    "llama"    },
-    { LLM_ARCH_FALCON,   "falcon"   },
-    { LLM_ARCH_GPT2,     "gpt2"     },
-    { LLM_ARCH_GPTJ,     "gptj"     },
-    { LLM_ARCH_GPTNEOX,  "gptneox"  },
-    { LLM_ARCH_MPT,      "mpt"      },
-    { LLM_ARCH_BAICHUAN,"baichuan"  },
+    { LLM_ARCH_LLAMA,     "llama"     },
+    { LLM_ARCH_FALCON,    "falcon"    },
+    { LLM_ARCH_GPT2,      "gpt2"      },
+    { LLM_ARCH_GPTJ,      "gptj"      },
+    { LLM_ARCH_GPTNEOX,   "gptneox"   },
+    { LLM_ARCH_MPT,       "mpt"       },
+    { LLM_ARCH_BAICHUAN,  "baichuan"  },
+    { LLM_ARCH_STARCODER, "starcoder" },
 };
 
 enum llm_kv {
@@ -218,16 +222,16 @@ enum llm_kv {
 };
 
 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"         },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
-    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"            },
-    { LLM_KV_GENERAL_NAME,                 "general.name"                 },
-    { LLM_KV_GENERAL_AUTHOR,               "general.author"               },
-    { LLM_KV_GENERAL_URL,                  "general.url"                  },
-    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"          },
-    { LLM_KV_GENERAL_LICENSE,              "general.license"              },
-    { LLM_KV_GENERAL_SOURCE_URL,           "general.
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.
+    { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
+    { LLM_KV_GENERAL_NAME,                  "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
+    { LLM_KV_GENERAL_URL,                   "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,               "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },
 
     { LLM_KV_CONTEXT_LENGTH,   "%s.context_length"   },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -376,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_POS_EMBD,    "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -430,7 +449,7 @@ struct LLM_TN {
 //
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -442,7 +461,7 @@ struct LLM_TN {
     } else if (req) { \
         throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
-}
+} while (0)
 
 //
 // ggml helpers
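The `do { ... } while (0)` form above is the standard C/C++ idiom for multi-statement macros. The sketch below is illustrative only (the `CHECK_*` macros are hypothetical, not part of this diff): a bare-block macro followed by a semicolon breaks `if`/`else` chaining, while the `do`/`while (0)` wrapper expands to a single statement that still requires the trailing semicolon.

```cpp
#include <cstdio>

// Bare block: "if (ok) CHECK_BAD(ok); else ..." fails to compile, because the
// ';' after the expanded block terminates the if-statement before the 'else'.
#define CHECK_BAD(x)  { if (!(x)) std::printf("failed: %s\n", #x); }

// do/while(0): the macro body is one statement that still demands a trailing
// ';', so it nests inside if/else exactly like a function call would.
#define CHECK_GOOD(x) do { if (!(x)) std::printf("failed: %s\n", #x); } while (0)

int main() {
    int ok = 1;
    if (ok) CHECK_GOOD(ok); else std::printf("not ok\n");
    return 0;
}
```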
@@ -680,6 +699,7 @@ struct llama_mmap {
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -862,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 
 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -880,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
 
 struct llama_state {
     // We save the log callback globally
-    llama_log_callback log_callback = llama_log_callback_default;
+    ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
 
@@ -889,9 +909,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -901,24 +923,24 @@ enum e_model {
 
 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-
-    uint32_t
-    uint32_t
-    uint32_t n_embd
-    uint32_t n_head
-    uint32_t n_head_kv
-    uint32_t n_layer
-    uint32_t n_rot
-    uint32_t n_ff
-
-    float f_norm_eps
-    float f_norm_rms_eps
-
-    float
-    float
+    bool     vocab_only;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -935,15 +957,18 @@ struct llama_hparams {
     uint32_t n_embd_gqa() const {
         return n_embd/n_gqa();
     }
+};
 
-
-
-
-
-
-
-
-
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads;       // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool mul_mat_q;
 };
 
 struct llama_layer {
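This split separates what the GGUF file states about the model (`llama_hparams`, including the new `*_train` RoPE values) from per-context inference settings (`llama_cparams`). A minimal sketch of the fallback logic this enables, using stand-in struct and function names of my own (assumptions, not code from this diff):

```cpp
// Sketch: runtime RoPE settings fall back to the trained values when the
// user leaves them unset (0 meaning "use the model's training-time value").
struct hparams_sketch { float rope_freq_base_train; float rope_freq_scale_train; };
struct cparams_sketch { float rope_freq_base;       float rope_freq_scale;       };

static void resolve_rope(const hparams_sketch & hp, cparams_sketch & cp) {
    if (cp.rope_freq_base  == 0.0f) cp.rope_freq_base  = hp.rope_freq_base_train;
    if (cp.rope_freq_scale == 0.0f) cp.rope_freq_scale = hp.rope_freq_scale_train;
}
```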
@@ -960,16 +985,47 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;
 
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
 
     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
+};
+
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
 };
 
+// ring-buffer of cached KV data
 struct llama_kv_cache {
+    bool has_shift = false;
+
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;
 
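Each `llama_kv_cell` records the position it stores and the set of sequences that reference it, which is what lets one physical cache serve several decoding streams at once. As an illustration only (this helper does not exist in the diff), counting the cells owned by one sequence looks like this:

```cpp
// Hypothetical helper built on the structs above: number of cache cells
// that currently hold data for sequence `id`.
static uint32_t kv_cells_for_seq(const llama_kv_cache & cache, llama_seq_id id) {
    uint32_t n = 0;
    for (const auto & cell : cache.cells) {
        if (cell.pos >= 0 && cell.has_seq_id(id)) {
            n++;
        }
    }
    return n;
}
```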
@@ -977,8 +1033,6 @@ struct llama_kv_cache {
 
     llama_buffer buf;
 
-    int n; // number of tokens currently in the cache
-
     ~llama_kv_cache() {
         if (ctx) {
             ggml_free(ctx);
@@ -1040,10 +1094,11 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1091,11 +1146,8 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model) : model(model),
+    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
-        if (model_owner) {
-            delete &model;
-        }
 #ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
@@ -1106,27 +1158,26 @@ struct llama_context {
         }
     }
 
+    llama_cparams cparams;
+
+    const llama_model & model;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
 
+    int64_t t_start_us;
+    int64_t t_load_us;
     int64_t t_sample_us = 0;
-    int64_t t_eval_us   = 0;
     int64_t t_p_eval_us = 0;
+    int64_t t_eval_us   = 0;
 
     int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-    const llama_model & model;
-
-    bool model_owner = false;
-
-    int64_t t_load_us;
-    int64_t t_start_us;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
+    int32_t n_eval   = 0; // number of eval calls
 
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
@@ -1161,16 +1212,23 @@ static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
                          ggml_type   wtype,
-                               int   n_ctx,
+                          uint32_t   n_ctx,
                                int   n_gpu_layers) {
-    const int n_embd  = hparams.n_embd_gqa();
-    const int n_layer = hparams.n_layer;
+    const uint32_t n_embd  = hparams.n_embd_gqa();
+    const uint32_t n_layer = hparams.n_layer;
 
     const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
+    cache.has_shift = false;
+
+    cache.head = 0;
+    cache.size = n_ctx;
+
+    cache.cells.clear();
+    cache.cells.resize(n_ctx);
+
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
@@ -1191,17 +1249,154 @@ static bool llama_kv_cache_init(
 
     (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer + 1) {
+    size_t vram_kv_cache = 0;
+
+    if (n_gpu_layers > (int)n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
-    if (n_gpu_layers > n_layer + 2) {
+    if (n_gpu_layers > (int)n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS
 
     return true;
 }
 
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+static bool llama_kv_cache_find_slot(
+           struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
+    const uint32_t n_ctx    = cache.size;
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens > n_ctx) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+        return false;
+    }
+
+    uint32_t n_tested = 0;
+
+    while (true) {
+        if (cache.head + n_tokens > n_ctx) {
+            cache.head = 0;
+            n_tested += n_ctx - cache.head;
+            continue;
+        }
+
+        bool found = true;
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            if (cache.cells[cache.head + i].pos >= 0) {
+                found = false;
+                cache.head += i + 1;
+                n_tested   += i + 1;
+                break;
+            }
+        }
+
+        if (found) {
+            break;
+        }
+
+        if (n_tested >= n_ctx) {
+            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            return false;
+        }
+    }
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        cache.cells[cache.head + i].pos = batch.pos[i];
+        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+    }
+
+    return true;
+}
+
+// find how many cells are currently in use
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size - 1; i > 0; --i) {
+        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+            return i + 1;
+        }
+    }
+
+    return 0;
+}
+
+static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+    if (c0 < 0) c0 = 0;
+    if (c1 < 0) c1 = cache.size;
+
+    for (int32_t i = c0; i < c1; ++i) {
+        cache.cells[i].pos = -1;
+        cache.cells[i].seq_id.clear();
+    }
+}
+
+static void llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.erase(seq_id);
+            if (cache.cells[i].seq_id.empty()) {
+                cache.cells[i].pos = -1;
+            }
+        }
+    }
+}
+
+static void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (!cache.cells[i].has_seq_id(seq_id)) {
+            cache.cells[i].pos = -1;
+            cache.cells[i].seq_id.clear();
+        }
+    }
+}
+
+static void llama_kv_cache_seq_shift(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].pos += delta;
+            if (cache.cells[i].pos < 0) {
+                cache.cells[i].pos = -1;
+                cache.cells[i].seq_id.clear();
+            } else {
+                cache.has_shift = true;
+                cache.cells[i].delta = delta;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
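Taken together these helpers give the cache copy-on-write sequence semantics: `llama_kv_cache_seq_cp` only adds another `seq_id` to the cells in a range, so forking a sequence costs no extra cache memory, and `llama_kv_cache_seq_rm` frees a cell only when its last sequence is removed. A hedged usage sketch follows; `fork_and_rollback` is hypothetical and simply strings the static helpers above together:

```cpp
#include <limits>

// Fork sequence 0 into sequence 1 to speculate, then throw the
// speculation away while keeping the shared prefix intact.
static void fork_and_rollback(llama_kv_cache & cache, llama_pos n_kept) {
    const llama_pos p_inf = std::numeric_limits<llama_pos>::max();

    llama_kv_cache_seq_cp(cache, /*seq_id_src =*/ 0, /*seq_id_dst =*/ 1, 0, n_kept);
    // ... decode a few tokens on sequence 1 ...
    llama_kv_cache_seq_rm(cache, /*seq_id =*/ 1, n_kept, p_inf);
}
```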
@@ -1244,6 +1439,7 @@ struct llama_model_loader {
     int n_created = 0;
 
     int64_t n_elements = 0;
+    size_t  n_bytes    = 0;
 
     bool use_mmap = false;
 
@@ -1276,6 +1472,7 @@ struct llama_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
             n_elements += ggml_nelements(t);
+            n_bytes    += ggml_nbytes(t);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1521,7 +1718,7 @@ struct llama_model_loader {
                         lmlock->grow_to(size_lock);
                     }
                     break;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
                     // old code:
@@ -1554,7 +1751,15 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1587,9 +1792,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1608,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 
 static void llm_load_hparams(
         llama_model_loader & ml,
-        llama_model & model,
-        int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale) {
+        llama_model & model) {
     struct gguf_context * ctx = ml.ctx_gguf;
 
     const auto kv = LLM_KV(model.arch);
@@ -1622,40 +1826,25 @@ static void llm_load_hparams(
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32,
+    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    //
-
-
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
-    }
+    // rope_freq_base (optional)
+    hparams.rope_freq_base_train = 10000.0f;
+    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
 
     // rope_freq_scale (inverse of the kv) is optional
-
-
-
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
-    }
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    hparams.rope_freq_scale_train = 1.0f/ropescale;
 
     // sanity check for n_rot (optional)
     {
@@ -1707,14 +1896,21 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
-    }
+    }
 
     model.ftype = ml.ftype;
-
-    hparams.n_ctx           = n_ctx;
-    hparams.rope_freq_base  = rope_freq_base;
-    hparams.rope_freq_scale = rope_freq_scale;
 }
 
 // TODO: This should probably be in llama.h
@@ -1735,20 +1931,18 @@ static void llm_load_vocab(
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }
 
+    const float * scores = nullptr;
     const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx
-
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
     }
 
-    const
-
+    const int * toktypes = nullptr;
     const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx
-
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }
 
-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
     // determine vocab type
     {
         std::string tokenizer_name;
@@ -1816,8 +2010,8 @@ static void llm_load_vocab(
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text  = std::move(word);
-        token_data.score = scores[i];
-        token_data.type  = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1840,27 +2034,31 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & vocab   = model.vocab;
 
     // hparams
-    LLAMA_LOG_INFO("%s: format
-    LLAMA_LOG_INFO("%s: arch
-    LLAMA_LOG_INFO("%s: vocab type
-    LLAMA_LOG_INFO("%s: n_vocab
-    LLAMA_LOG_INFO("%s: n_merges
-    LLAMA_LOG_INFO("%s: n_ctx_train
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s: model
-    LLAMA_LOG_INFO("%s: model
-
+    LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (int) vocab.bpe_ranks.size());
+    LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head           = %u\n",     __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv        = %u\n",     __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa            = %u\n",     __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
+    LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
+    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
+    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }
 
     // general kv
     LLAMA_LOG_INFO("%s: general.name   = %s\n", __func__, model.name.c_str());
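The new size report uses the `n_bytes` total tracked by the loader: bits per weight (BPW) is simply `n_bytes*8.0/n_elements`. A self-contained arithmetic check with made-up numbers (roughly a 7B model at a 4.5-BPW quantization):

```cpp
#include <cstdio>

int main() {
    const double n_elements = 6.74e9; // ~6.74 B parameters (illustrative)
    const double n_bytes    = 3.79e9; // ~3.79 GB of tensor data (illustrative)

    std::printf("model params = %.2f B\n", n_elements*1e-9);
    std::printf("model size   = %.2f GiB (%.2f BPW)\n",
                n_bytes/1024.0/1024.0/1024.0, // 3.53 GiB
                n_bytes*8.0/n_elements);      // 4.50 bits per weight
    return 0;
}
```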
@@ -1877,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 static void llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
-        int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
-        const bool mul_mat_q,
-        bool low_vram,
-        ggml_type memory_type,
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
@@ -1922,11 +2116,9 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
-
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
-    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1961,9 +2153,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-            backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2027,9 +2219,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-            backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2097,9 +2289,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-            backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2160,29 +2352,100 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},             GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend_split);
+
+                    layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                            ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                            ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2)          +
+                            ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
-        }
+        }
     }
 
     ml.done_getting_tensors();
 
     // print memory requirements
     {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
         // this is the total memory required to run the inference
         size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-
-        const size_t mem_required_state = scale*hparams.kv_size();
-
-        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
-        (void) n_batch;
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2191,36 +2454,17 @@ static void llm_load_tensors(
         if (n_gpu_layers > (int) hparams.n_layer) {
             LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
         }
-        size_t vram_kv_cache = 0;
 
 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
-        const int max_offloadable_layers       =
-        if (n_gpu_layers > (int) hparams.n_layer + 1) {
-            if (low_vram) {
-                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
-        if (n_gpu_layers > (int) hparams.n_layer + 2) {
-            if (low_vram) {
-                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
+        const int max_offloadable_layers       = hparams.n_layer + 3;
#elif defined(GGML_USE_CLBLAST)
         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers       = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS
 
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
-
-        LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2233,7 +2477,7 @@ static void llm_load_tensors(
     }
 
     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -2255,29 +2499,24 @@ static void llm_load_tensors(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        int n_ctx,
-        int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
         bool vocab_only,
        llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;
 
-        llm_load_arch   (
-        llm_load_hparams(
-        llm_load_vocab  (
+        llm_load_arch   (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab  (ml, model);
 
-        llm_load_print_meta(
+        llm_load_print_meta(ml, model);
 
         if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
@@ -2289,8 +2528,8 @@ static bool llama_model_load(
         }
 
         llm_load_tensors(
-
-            main_gpu, tensor_split,
+            ml, model, n_gpu_layers,
+            main_gpu, tensor_split,
             use_mlock, progress_callback, progress_callback_user_data);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2302,17 +2541,10 @@ static bool llama_model_load(
 
 static struct ggml_cgraph * llm_build_llama(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
@@ -2320,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(
 
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -2328,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    const float freq_base  = hparams.rope_freq_base;
-    const float freq_scale = hparams.rope_freq_scale;
+    const float freq_base    = cparams.rope_freq_base;
+    const float freq_scale   = cparams.rope_freq_scale;
     const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("n_kv = %d\n", n_kv);
+
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
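The `n_kv`/`kv_head` pair implements a worst-case convention: while `ggml_allocr` is only measuring the graph, attention is sized for a full context so that any later real batch fits in the reserved buffers; at run time the live cache head and cell count are used instead. Restated as a tiny sketch (the names below are mine, not from the diff):

```cpp
struct kv_view { int32_t n_kv; int32_t kv_head; };

// During the measuring pass, pretend the cache is full and the batch is
// appended at the very end; otherwise report the cache's actual state.
static kv_view pick_kv_view(bool measuring, int32_t n_ctx, int32_t n_tokens,
                            int32_t cache_n, int32_t cache_head) {
    if (measuring) {
        return { n_ctx, n_ctx - n_tokens };
    }
    return { cache_n, cache_head };
}
```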
@@ -2351,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
-    if (tokens) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -2366,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }
 
@@ -2379,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(
 
     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -2398,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
     }
 #endif // GGML_USE_CUBLAS
 
+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
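The explicit `KQ_mask` replaces the old `ggml_diag_mask_inf`-style causal mask: query `j` may attend to cache cell `i` only if the cell belongs to `j`'s sequence and holds a position not later than `j`'s. The standalone sketch below restates that rule outside the graph builder (illustrative only, not code from the diff):

```cpp
#include <cmath>
#include <vector>

static std::vector<float> build_kq_mask(const llama_kv_cache & cache,
                                        const std::vector<llama_pos>    & pos,
                                        const std::vector<llama_seq_id> & seq,
                                        int n_kv) {
    const int n_tokens = (int) pos.size();
    std::vector<float> mask((size_t) n_kv * n_tokens, 0.0f);

    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            // -INF entries are added to KQ_scaled before soft_max, exactly
            // like ggml_add(KQ_scaled, KQ_mask) in the graph above.
            if (!cache.cells[i].has_seq_id(seq[j]) || cache.cells[i].pos > pos[j]) {
                mask[(size_t) j * n_kv + i] = -INFINITY;
            }
        }
    }
    return mask;
}
```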
@@ -2441,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
         offload_func_kq(tmpq);
         ggml_set_name(tmpq, "tmpq");
 
-        struct ggml_tensor * Kcur =
+        struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
         offload_func_kq(Kcur);
         ggml_set_name(Kcur, "Kcur");
 
-        struct ggml_tensor * Qcur =
+        struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head,    n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
         offload_func_kq(Qcur);
         ggml_set_name(Qcur, "Qcur");
 
         // store key and value to memory
         {
-            // compute the transposed [N, n_embd] V matrix
+            // compute the transposed [n_tokens, n_embd] V matrix
 
             struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             offload_func_v(tmpv);
             ggml_set_name(tmpv, "tmpv");
 
-            struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+            struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
             offload_func_v(Vcur);
             ggml_set_name(Vcur, "Vcur");
 
-            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
             offload_func_kq(k);
             ggml_set_name(k, "k");
 
-            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                     (   n_ctx)*ggml_element_size(kv_self.v),
-                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
             offload_func_v(v);
             ggml_set_name(v, "v");
 
@@ -2482,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
 
         struct ggml_tensor * K =
             ggml_view_3d(ctx0, kv_self.k,
-                    n_embd_head, n_past + N, n_head_kv,
+                    n_embd_head, n_kv, n_head_kv,
                     ggml_element_size(kv_self.k)*n_embd_gqa,
                     ggml_element_size(kv_self.k)*n_embd_head,
                     ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2495,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
         ggml_set_name(KQ, "KQ");
 
         // KQ_scaled = KQ / sqrt(n_embd_head)
-        // KQ_scaled shape [n_past + N, N, n_head, 1]
-        struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+        // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
         offload_func_kq(KQ_scaled);
         ggml_set_name(KQ_scaled, "KQ_scaled");
 
         // KQ_masked = mask_past(KQ_scaled)
-        struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+        struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
         offload_func_kq(KQ_masked);
         ggml_set_name(KQ_masked, "KQ_masked");
 
         // KQ = soft_max(KQ_masked)
-        struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+        struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
         offload_func_v(KQ_soft_max);
         ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
         // split cached V into n_head heads
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
-                    n_past + N, n_embd_head, n_head_kv,
+                    n_kv, n_embd_head, n_head_kv,
                     ggml_element_size(kv_self.v)*n_ctx,
                     ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                     ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2528,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
         // make V contiguous in memory to speed up the matmul, however we waste time on the copy
         // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
         // is there a better way?
-        struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+        struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif
@@ -2537,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
         offload_func_v(KQV_merged);
         ggml_set_name(KQV_merged, "KQV_merged");
 
-        // cur = KQV_merged.contiguous().view(n_embd, N)
-        cur = ggml_cpy(ctx0,
-                KQV_merged,
-                ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+        // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+        cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
         offload_func_v(cur);
         ggml_set_name(cur, "KQV_merged_contiguous");
 
@@ -2631,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2631
2929
|
return gf;
|
2632
2930
|
}
|
2633
2931
|
|
2634
|
-
|
2635
2932
|
static struct ggml_cgraph * llm_build_baichaun(
|
2636
2933
|
llama_context & lctx,
|
2637
|
-
const
|
2638
|
-
const float * embd,
|
2639
|
-
int n_tokens,
|
2640
|
-
int n_past) {
|
2641
|
-
|
2642
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
2643
|
-
|
2644
|
-
const int N = n_tokens;
|
2645
|
-
|
2934
|
+
const llama_batch & batch) {
|
2646
2935
|
const auto & model = lctx.model;
|
2647
2936
|
const auto & hparams = model.hparams;
|
2937
|
+
const auto & cparams = lctx.cparams;
|
2648
2938
|
|
2649
2939
|
const auto & kv_self = lctx.kv_self;
|
2650
2940
|
|
@@ -2652,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2652
2942
|
|
2653
2943
|
const int64_t n_embd = hparams.n_embd;
|
2654
2944
|
const int64_t n_layer = hparams.n_layer;
|
2655
|
-
const int64_t n_ctx =
|
2945
|
+
const int64_t n_ctx = cparams.n_ctx;
|
2656
2946
|
const int64_t n_head = hparams.n_head;
|
2657
2947
|
const int64_t n_head_kv = hparams.n_head_kv;
|
2658
2948
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -2660,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2660
2950
|
|
2661
2951
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
2662
2952
|
|
2663
|
-
const float freq_base =
|
2664
|
-
const float freq_scale =
|
2953
|
+
const float freq_base = cparams.rope_freq_base;
|
2954
|
+
const float freq_scale = cparams.rope_freq_scale;
|
2665
2955
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
2666
2956
|
|
2667
2957
|
const int n_gpu_layers = model.n_gpu_layers;
|
2668
2958
|
|
2959
|
+
const int32_t n_tokens = batch.n_tokens;
|
2960
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
2961
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
2962
|
+
|
2963
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
2964
|
+
|
2669
2965
|
auto & buf_compute = lctx.buf_compute;
|
2670
2966
|
|
2671
2967
|
struct ggml_init_params params = {
|
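
Note: from this point on, every graph builder trades the old (tokens, embd, n_tokens, n_past) argument list for a single const llama_batch &, and reads context-level settings (n_ctx, rope_freq_base, rope_freq_scale) from the new lctx.cparams instead of the model hyperparameters. The scalars n_kv (how many cache cells the graph attends to), kv_head (where this batch lands in the cache) and do_rope_shift replace the old n_past arithmetic. The sketch below reconstructs the KV-cache bookkeeping these builders rely on purely from the calls visible in this diff; treat the exact layout as an assumption, since the authoritative definitions live earlier in llama.cpp.

    // Sketch (inferred from usage in this diff), not the verbatim source:
    struct llama_kv_cell {
        llama_pos pos   = -1;  // position of the cached token
        llama_pos delta = 0;   // pending position shift, consumed by the K-shift pass

        std::set<llama_seq_id> seq_id;  // sequences this cell belongs to

        bool has_seq_id(const llama_seq_id & id) const {
            return seq_id.find(id) != seq_id.end();
        }
    };

    struct llama_kv_cache {
        bool     has_shift = false;  // true while cells carry unapplied deltas
        uint32_t head      = 0;      // cell where the next batch is written
        uint32_t n         = 0;      // number of cells the graph must attend to

        std::vector<llama_kv_cell> cells;
        // ... plus the backing k/v tensors and their ggml context
    };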
@@ -2683,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);

         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data,
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");

@@ -2698,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
         GGML_ASSERT(false && "not implemented");
 #endif

-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd,
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }

@@ -2711,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(

     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -2730,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
     }
 #endif // GGML_USE_CUBLAS

+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                            ggml_view_3d(ctx0, kv_self.k,
+                                n_embd_head, n_head_kv, n_ctx,
+                                ggml_element_size(kv_self.k)*n_embd_head,
+                                ggml_element_size(kv_self.k)*n_embd_gqa,
+                                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }

     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
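
Note: the KQ_mask block added above replaces ggml_diag_mask_inf as the causal mask. Token j of the batch may attend cache cell i only when the cell belongs to the same sequence and holds a position not greater than pos[j]; every other entry is set to -INFINITY, so adding the mask to KQ_scaled before the soft-max zeroes those attention weights. A minimal standalone restatement of that rule (a hypothetical helper for illustration, not part of llama.cpp):

    #include <cmath>
    #include <vector>

    // mask[j*n_kv + i] == 0         -> token j may attend cache cell i
    // mask[j*n_kv + i] == -INFINITY -> attention weight becomes 0 after soft-max
    std::vector<float> build_kq_mask(const std::vector<llama_kv_cell> & cells,
                                     const llama_pos    * pos,     // batch positions
                                     const llama_seq_id * seq_id,  // batch sequence ids
                                     int n_kv, int n_tokens) {
        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (!cells[i].has_seq_id(seq_id[j]) || cells[i].pos > pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        return mask;
    }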
@@ -2777,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
         struct ggml_tensor * Qcur;
         switch (model.type) {
             case MODEL_7B:
-                Kcur =
-                Qcur =
+                Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens),    KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                 break;
             case MODEL_13B:
-                Kcur
-                Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head,
+                Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
+                Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
                 break;
             default:
                 GGML_ASSERT(false);
@@ -2796,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(

         // store key and value to memory
         {
-            // compute the transposed [
+            // compute the transposed [n_tokens, n_embd] V matrix

             struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             offload_func_v(tmpv);
             ggml_set_name(tmpv, "tmpv");

-            struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
+            struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
             offload_func_v(Vcur);
             ggml_set_name(Vcur, "Vcur");

-            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
+            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
             offload_func_kq(k);
             ggml_set_name(k, "k");

-            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
+            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                     (   n_ctx)*ggml_element_size(kv_self.v),
-                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
+                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
             offload_func_v(v);
             ggml_set_name(v, "v");

@@ -2827,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(

         struct ggml_tensor * K =
             ggml_view_3d(ctx0, kv_self.k,
-                    n_embd_head,
+                    n_embd_head, n_kv, n_head_kv,
                     ggml_element_size(kv_self.k)*n_embd_gqa,
                     ggml_element_size(kv_self.k)*n_embd_head,
                     ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2840,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
         ggml_set_name(KQ, "KQ");

         // KQ_scaled = KQ / sqrt(n_embd_head)
-        // KQ_scaled shape [n_past +
-        struct ggml_tensor * KQ_scaled =
+        // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
         offload_func_kq(KQ_scaled);
         ggml_set_name(KQ_scaled, "KQ_scaled");

@@ -2850,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(

         switch (model.type) {
             case MODEL_7B:
-                KQ_masked =
+                KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
                 break;
             case MODEL_13B:
-
+                // TODO: replace with ggml_add()
+                KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
                 ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-                KQ_masked =
+                KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
                 break;
             default:
                 GGML_ASSERT(false);
         }
-        // KQ_masked = mask_past(KQ_scaled)
-        // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-        // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-        // offload_func_kq(KQ_masked);
-        // ggml_set_name(KQ_masked, "KQ_masked");

         // KQ = soft_max(KQ_masked)
-        struct ggml_tensor * KQ_soft_max =
+        struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
         offload_func_v(KQ_soft_max);
         ggml_set_name(KQ_soft_max, "KQ_soft_max");

         // split cached V into n_head heads
         struct ggml_tensor * V =
             ggml_view_3d(ctx0, kv_self.v,
-
+                    n_kv, n_embd_head, n_head_kv,
                     ggml_element_size(kv_self.v)*n_ctx,
                     ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                     ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
         offload_func_v(V);
         ggml_set_name(V, "V");

-#if 1
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
         offload_func_v(KQV);
         ggml_set_name(KQV, "KQV");
-#else
-        // make V contiguous in memory to speed up the matmul, however we waste time on the copy
-        // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
-        // is there a better way?
-        struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
-        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif

         // KQV_merged = KQV.permute(0, 2, 1, 3)
         struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
         offload_func_v(KQV_merged);
         ggml_set_name(KQV_merged, "KQV_merged");

-        // cur = KQV_merged.contiguous().view(n_embd,
-        cur =
-                KQV_merged,
-                ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+        // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+        cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
         offload_func_v(cur);
         ggml_set_name(cur, "KQV_merged_contiguous");

@@ -2994,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(

 static struct ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
-     const
-     const float * embd,
-           int   n_tokens,
-           int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;

     const auto & kv_self = lctx.kv_self;

@@ -3012,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(

     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       =
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -3020,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(

     GGML_ASSERT(n_embd_head == hparams.n_rot);

-    const float freq_base  =
-    const float freq_scale =
+    const float freq_base  = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
     const float norm_eps   = hparams.f_norm_eps;

     const int n_gpu_layers = model.n_gpu_layers;

+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
+    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
+
     auto & buf_compute = lctx.buf_compute;

     struct ggml_init_params params = {
@@ -3043,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;

-    if (
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);

         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data,
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");

@@ -3058,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
         GGML_ASSERT(false && "not implemented");
 #endif

-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd,
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }

@@ -3071,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(

     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -3090,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
     }
 #endif // GGML_USE_CUBLAS

+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                            ggml_view_3d(ctx0, kv_self.k,
+                                n_embd_head, n_head_kv, n_ctx,
+                                ggml_element_size(kv_self.k)*n_embd_head,
+                                ggml_element_size(kv_self.k)*n_embd_gqa,
+                                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }

     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * attn_norm;
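
Note: the do_rope_shift pass that both builders now emit re-rotates every layer's cached K tensor by each cell's delta (Falcon passes RoPE mode 2, the NeoX layout, where LLaMA and Baichuan use mode 0). One extra RoPE pass suffices because rotary angles are linear in the position, so rotating an already-encoded key by delta moves it from position p to p + delta. A small illustration of that identity (not llama.cpp code):

    #include <complex>

    // RoPE rotates each feature pair by pos * theta; angles add, hence
    // rope_pair(rope_pair(x, theta, p), theta, d) == rope_pair(x, theta, p + d)
    std::complex<float> rope_pair(std::complex<float> x, float theta, int pos) {
        return x * std::polar(1.0f, theta * (float) pos);
    }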
@@ -3152,148 +3556,395 @@ static struct ggml_cgraph * llm_build_falcon(
             // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
             // non-contiguous views is added for the rope operator
             struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head,
+                ctx0, cur, n_embd_head, n_head, n_tokens,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 0));
             offload_func_kq(tmpq);

-            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head));
-            offload_func_kq(tmpk);
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head));
+            offload_func_kq(tmpk);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_v(tmpv);
+
+            // using mode = 2 for neox mode
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(Qcur);
+            struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(Kcur);
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * attn_out = cur;
+
+        // feed forward
+        {
+            struct ggml_tensor * inpFF = attn_norm;
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
+            offload_func(cur);
+
+            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_add(ctx0,
+                ggml_mul(ctx0, cur, model.output_norm),
+                model.output_norm_b);
+        ggml_set_name(cur, "result_norm");
+    }
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_starcoder(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < n_tokens; ++i) {
+                ((int32_t *) inp_positions->data)[i] = batch.pos[i];
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");

-
-
-
-
-
-
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }

-
-
-
-
-
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;

             {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
-                offload_func_v(Vcur);
-                offload_func_v(Vcur->src[0]->src[0]);
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
                 ggml_set_name(Vcur, "Vcur");

-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
-                offload_func_kq(k);
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 ggml_set_name(k, "k");

-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
-                offload_func_v(v);
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));

                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }

-            struct ggml_tensor * Q =
-
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
+                        0, 2, 1, 3);
             ggml_set_name(Q, "Q");

             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head,
+                        n_embd_head, n_kv, n_head_kv,
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
-            offload_func_kq(K);
             ggml_set_name(K, "K");

+            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
-            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

-
-
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
             ggml_set_name(KQ_masked, "KQ_masked");

+            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

+            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-
+                        n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
-            offload_func_v(V);
             ggml_set_name(V, "V");

             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");

+            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

-            cur =
-
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             ggml_set_name(cur, "KQV_merged_contiguous");
-
-            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
-            offload_func(cur);
-            ggml_set_name(cur, "result_wo");
         }

-
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);

-        //
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
         {
-
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }

-            cur = ggml_mul_mat(ctx0, model.layers[il].w3,
-            offload_func(cur);
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);

+            // GELU activation
             cur = ggml_gelu(ctx0, cur);
-            offload_func(cur);
-            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
-            offload_func(cur);
-        }

-
-
-
-            offload_func(cur);
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }

-
-        inpL = cur;
+        inpL = ggml_add(ctx0, cur, inpFF);
     }

-
-
-    // norm
+    // Output Norm
     {
-        cur = ggml_norm(ctx0,
-
-
-        cur = ggml_add(ctx0,
-                ggml_mul(ctx0, cur, model.output_norm),
-                model.output_norm_b);
-        ggml_set_name(cur, "result_norm");
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
     }
+    ggml_set_name(cur, "result_norm");

     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");

     ggml_build_forward_expand(gf, cur);
-
     ggml_free(ctx0);

     return gf;
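
Note: the hunk above also introduces llm_build_starcoder, the builder behind the new LLM_ARCH_STARCODER entry. Unlike the RoPE architectures it uses learned absolute position embeddings, so there is no KQ_pos tensor and no K-cache shift pass. Its condensed input path (names exactly as in the diff above):

    token    = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);    // [n_embd, n_tokens]
    position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); // [n_embd, n_tokens]
    inpL     = ggml_add(ctx0, token, position);                          // layer 0 input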
@@ -3301,10 +3952,7 @@ static struct ggml_cgraph * llm_build_falcon(

 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
-     const
-     const float * embd,
-           int   n_tokens,
-           int   n_past) {
+     const llama_batch & batch) {
     const auto & model = lctx.model;

     struct ggml_cgraph * result = NULL;
@@ -3312,72 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                result = llm_build_llama(lctx,
+                result = llm_build_llama(lctx, batch);
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-                result = llm_build_baichaun(lctx,
+                result = llm_build_baichaun(lctx, batch);
             } break;
         case LLM_ARCH_FALCON:
             {
-                result = llm_build_falcon(lctx,
+                result = llm_build_falcon(lctx, batch);
+            } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm_build_starcoder(lctx, batch);
             } break;
         default:
             GGML_ASSERT(false);
-    }
+    }

     return result;
 }

-//
+// decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
-//   -
-//   - embd       embeddings input
-//   - n_tokens   number of tokens
-//   - n_past:    the context size so far
+//   - batch:     batch to evaluate
 //   - n_threads: number of threads to use
 //
-
+// return 0 on success
+// return positive int on warning
+// return negative int on error
+//
+static int llama_decode_internal(
          llama_context & lctx,
-
-
-
-
-
-
+           llama_batch   batch) {
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+        return -1;
+    }
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;

-
+    const auto n_batch = cparams.n_batch;

-    GGML_ASSERT(n_tokens
-
-
-
-    // GGML_ASSERT(n_past + n_tokens <= n_ctx);
+    GGML_ASSERT(n_tokens <= n_batch);
+
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     const int64_t t_start_us = ggml_time_us();

 #ifdef GGML_USE_MPI
-
+    // TODO: needs fix after #3228
+    GGML_ASSERT(false && "not implemented");
+    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif

     GGML_ASSERT(n_threads > 0);

-
-
-    const auto & model = lctx.model;
-    const auto & hparams = model.hparams;
-
-    const auto & kv_self = lctx.kv_self;
+    auto & kv_self = lctx.kv_self;

     GGML_ASSERT(!!kv_self.ctx);

     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;

+    // helpers for smoother batch API transistion
+    // after deprecating the llama_eval calls, these will be removed
+    std::vector<llama_pos> pos;
+    std::vector<llama_seq_id> seq_id;
+
+    if (batch.pos == nullptr) {
+        pos.resize(n_tokens);
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
+        }
+
+        batch.pos = pos.data();
+    }
+
+    if (batch.seq_id == nullptr) {
+        seq_id.resize(n_tokens);
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            seq_id[i] = batch.all_seq_id;
+        }
+
+        batch.seq_id = seq_id.data();
+    }
+
+    // we always start to search for a free slot from the start of the cache
+    // TODO: better strategies can be implemented
+    kv_self.head = 0;
+
+    if (!llama_kv_cache_find_slot(kv_self, batch)) {
+        return 1;
+    }
+
+    // a heuristic, to avoid attending the full cache if it is not yet utilized
+    // after enough generations, the benefit from this heuristic disappears
+    // if we start defragmenting the cache, the benefit from this will be more important
+    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+
+    //printf("kv_self.n = %d\n", kv_self.n);
+
     ggml_allocr_reset(lctx.alloc);

-    ggml_cgraph * gf = llama_build_graph(lctx,
+    ggml_cgraph * gf = llama_build_graph(lctx, batch);

     ggml_allocr_alloc_graph(lctx.alloc, gf);

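
Note: llama_eval_internal becomes llama_decode_internal and is driven entirely by a llama_batch. When the caller leaves pos or seq_id as NULL, they are synthesized from all_pos_0 + i*all_pos_1 and all_seq_id, which is how the legacy llama_eval path keeps working. The fields consumed here, collected into one sketch (the authoritative definition lives in llama.h; treat this as a reconstruction from usage in this file):

    typedef struct llama_batch {
        int32_t n_tokens;

        llama_token  * token;   // token ids, or NULL when embd is used
        float        * embd;    // input embeddings, or NULL when token is used
        llama_pos    * pos;     // per-token positions, or NULL to synthesize
        llama_seq_id * seq_id;  // per-token sequence ids, or NULL to synthesize
        int8_t       * logits;  // per-token flag: keep this token's logits?

        // defaults applied when pos/seq_id are NULL:
        llama_pos    all_pos_0;  // position of the first token
        llama_pos    all_pos_1;  // position stride
        llama_seq_id all_seq_id; // sequence id for the whole batch
    } llama_batch;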
@@ -3386,6 +4079,7 @@ static bool llama_eval_internal(
             ggml_tensor * node = gf->leafs[i];
             if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
                 ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
+                ggml_cuda_copy_to_device(node);
             }
         }

@@ -3395,6 +4089,8 @@ static bool llama_eval_internal(
                 ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
             }
         }
+
+    ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
 #endif

     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3404,10 +4100,19 @@ static bool llama_eval_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (
+    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }

+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];

@@ -3423,10 +4128,6 @@ static bool llama_eval_internal(
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, res);
-        if (!lctx.embedding.empty()) {
-            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-        }
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
@@ -3438,12 +4139,9 @@ static bool llama_eval_internal(
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif

-    // update kv
-    lctx.kv_self.
-
-    if (cgraph_fname) {
-        ggml_graph_export(gf, cgraph_fname);
-    }
+    // update the kv ring buffer
+    lctx.kv_self.head      += n_tokens;
+    lctx.kv_self.has_shift  = false;

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -3460,13 +4158,20 @@ static bool llama_eval_internal(
     {
         auto & logits_out = lctx.logits;

-        if (
-            logits_out.resize(n_vocab *
-
+        if (batch.logits) {
+            logits_out.resize(n_vocab * n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                if (batch.logits[i] == 0) {
+                    continue;
+                }
+                memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
+            }
+        } else if (lctx.logits_all) {
+            logits_out.resize(n_vocab * n_tokens);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
         } else {
-            // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
         }
     }

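
Note: logits are now gathered per token according to batch.logits; row i is copied into lctx.logits only when its flag is set (or when logits_all is on), and with no flags at all only the last row survives, matching the old behaviour. A hypothetical caller-side sketch under those semantics, using only functions that appear in this diff:

    // decode a prompt, keeping logits only for its last token
    llama_batch batch = llama_batch_get_one(tokens.data(), (int) n_tokens, 0, 0);

    if (llama_decode(ctx, batch) == 0) {
        const float * logits = llama_get_logits(ctx); // n_vocab floats for the kept row
        // ... pick the next token from logits ...
    }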
@@ -3475,20 +4180,27 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;

         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(
+        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
     }

     // measure the performance only for the single-token evals
-    if (
+    if (n_tokens == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
-    else if (
+    else if (n_tokens > 1) {
         lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval +=
+        lctx.n_p_eval += n_tokens;
     }

-
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!lctx.has_evaluated_once) {
+        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+        lctx.has_evaluated_once = true;
+    }
+
+    return 0;
 }

 //
@@ -3909,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
-    }
+    }

     return output;
 }
@@ -3939,7 +4651,7 @@ struct llama_grammar_candidate {

 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char         * src,
         llama_partial_utf8   partial_start) {
     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4313,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
 // sampling
 //

+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
     GGML_ASSERT(candidates->size > 0);

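
Note: llama_set_rng_seed gives callers a way to re-seed the context's sampling RNG after creation; passing LLAMA_DEFAULT_SEED falls back to the current time. Usage sketch:

    llama_set_rng_seed(ctx, 1234);                // fixed seed -> reproducible sampling
    llama_set_rng_seed(ctx, LLAMA_DEFAULT_SEED);  // re-seed from time(NULL)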
@@ -4521,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
|
4521
5240
|
}
|
4522
5241
|
}
|
4523
5242
|
|
4524
|
-
void
|
5243
|
+
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
4525
5244
|
const int64_t t_start_sample_us = ggml_time_us();
|
4526
5245
|
|
4527
5246
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
@@ -4533,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
|
|
4533
5252
|
}
|
4534
5253
|
}
|
4535
5254
|
|
5255
|
+
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
5256
|
+
llama_sample_temp(ctx, candidates_p, temp);
|
5257
|
+
}
|
5258
|
+
|
4536
5259
|
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
4537
5260
|
if (last_tokens_size == 0 || penalty == 1.0f) {
|
4538
5261
|
return;
|
@@ -4656,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
|
|
4656
5379
|
|
4657
5380
|
GGML_ASSERT(ctx);
|
4658
5381
|
|
4659
|
-
auto n_vocab = llama_n_vocab(ctx);
|
5382
|
+
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
4660
5383
|
|
4661
5384
|
GGML_ASSERT(n_vocab == (int)candidates->size);
|
4662
5385
|
GGML_ASSERT(!candidates->sorted);
|
@@ -4685,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     GGML_ASSERT(ctx);
 
-    auto N = float(llama_n_vocab(ctx));
+    auto N = float(llama_n_vocab(llama_get_model(ctx)));
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -4872,7 +5595,7 @@ struct llama_logit_info {
     };
     llama_logit_info(llama_context * ctx)
       : logits(llama_get_logits(ctx))
-      , n_vocab(llama_n_vocab(ctx))
+      , n_vocab(llama_n_vocab(llama_get_model(ctx)))
       , max_l(*std::max_element(logits, logits + n_vocab))
       , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
       { }
@@ -4910,7 +5633,6 @@ struct llama_beam_search_data {
     size_t n_beams;
     int n_past;
    int n_predict;
-    int n_threads;
     std::vector<llama_beam> beams;
     std::vector<llama_beam> next_beams;
 
@@ -4920,12 +5642,11 @@ struct llama_beam_search_data {
     // Used to communicate to/from callback on beams state.
     std::vector<llama_beam_view> beam_views;
 
-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
       : ctx(ctx)
       , n_beams(n_beams)
       , n_past(n_past)
       , n_predict(n_predict)
-      , n_threads(n_threads)
       , beam_views(n_beams) {
         beams.reserve(n_beams);
         next_beams.reserve(n_beams);
@@ -4962,7 +5683,7 @@ struct llama_beam_search_data {
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
             if (!beam.tokens.empty()) {
-                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
             }
             llama_logit_info logit_info(ctx);
             std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5036,7 +5757,7 @@ struct llama_beam_search_data {
             callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
             update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
             if (common_prefix_length) {
-                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
                 n_past += common_prefix_length;
             }
             // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5077,11 +5798,11 @@ struct llama_beam_search_data {
 
 void llama_beam_search(llama_context * ctx,
                        llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+                       size_t n_beams, int n_past, int n_predict) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
 
     beam_search_data.loop(callback, callback_data);
 
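With threading moved into the context (see llama_set_n_threads further down), llama_beam_search() loses its n_threads argument. A sketch of a 0.6.0-style call; beam_cb and cb_data stand in for a user-supplied callback and its payload, and the counts are illustrative only:

    #include "llama.h"

    static void run_beam_search(llama_context * ctx,
                                llama_beam_search_callback_fn_t beam_cb, void * cb_data) {
        llama_beam_search(ctx, beam_cb, cb_data, /*n_beams=*/4, /*n_past=*/0, /*n_predict=*/32);
    }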
@@ -5301,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
 
     llama_model model;
-    llm_load_arch(*ml, model);
-    llm_load_hparams(*ml, model, 0, 0, 0);
+    llm_load_arch(ml, model);
+    llm_load_hparams(ml, model);
 
     if (params->only_copy) {
         ftype = model.ftype;
@@ -5315,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml->ctx_gguf);
+    gguf_set_kv     (ctx_out, ml.ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
@@ -5323,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
 
-    for (int i = 0; i < ml->n_tensors; ++i) {
-        struct ggml_tensor * meta = ml->get_tensor_meta(i);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -5360,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml->n_tensors; ++i) {
-        struct ggml_tensor * meta = ml->get_tensor_meta(i);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -5374,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (int i = 0; i < ml->n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml->get_tensor_meta(i);
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(tensor);
 
@@ -5383,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             read_data.resize(ggml_nbytes(tensor));
         }
         tensor->data = read_data.data();
-        ml->load_data_for(tensor);
+        ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, ml->n_tensors,
+               ++idx, ml.n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));
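The quantizer now holds the loader by value, but the public entry point keeps its shape. A sketch of driving it, with placeholder file names:

    #include "llama.h"

    static int quantize_q4_0() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_0;
        // returns 0 on success
        return llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &qparams);
    }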
@@ -5536,8 +6257,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-int llama_apply_lora_from_file_internal(
-        const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
@@ -5565,7 +6287,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
@@ -5781,9 +6503,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             ggml_set_name(r, "r_cpy");
         }
 
-        struct ggml_cgraph gf = ggml_build_forward(r);
+        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_build_forward_expand(gf, r);
 
-        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(work_buffer, gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
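As the scaling hunk above shows, the effective LoRA factor becomes scale * lora_alpha / lora_r. A sketch of the public wrapper (defined later in this diff), with a placeholder adapter path:

    #include "llama.h"

    static int apply_adapter(llama_model * model) {
        // scale = 1.0f reproduces the pre-0.6.0 behaviour
        return llama_model_apply_lora_from_file(model, "adapter.bin", /*scale=*/1.0f,
                                                /*path_base_model=*/NULL, /*n_threads=*/4);
    }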
@@ -5812,27 +6535,16 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 //
 // interface implementation
 //
-
-struct llama_context_params llama_context_default_params() {
-    struct llama_context_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
-        /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
-        /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
-        /*.embedding                   =*/ false,
     };
 
 #ifdef GGML_USE_METAL
@@ -5842,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_context_params llama_context_default_params() {
+    struct llama_context_params result = {
+        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
+        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
+        /*.mul_mat_q                   =*/ true,
+        /*.f16_kv                      =*/ true,
+        /*.logits_all                  =*/ false,
+        /*.embedding                   =*/ false,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
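Model-level and context-level settings now live in separate structs. A sketch of the resulting two-step initialization; "model.gguf" is a placeholder path:

    #include "llama.h"

    static llama_context * open_context() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) {
            return NULL;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx = 0; // 0 = fall back to the model's n_ctx_train (see llama_new_context_with_model below)
        return llama_new_context_with_model(model, cparams);
    }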
@@ -5897,13 +6627,11 @@ int64_t llama_time_us(void) {
 
 struct llama_model * llama_load_model_from_file(
                              const char * path_model,
-            struct llama_context_params   params) {
+              struct llama_model_params   params) {
     ggml_time_init();
 
     llama_model * model = new llama_model;
 
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -5920,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
         };
     }
 
-    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
-                params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
+    if (!llama_model_load(path_model, *model, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split,
+                params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
@@ -5946,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
 
     llama_context * ctx = new llama_context(*model);
 
+    const auto & hparams = model->hparams;
+    auto       & cparams = ctx->cparams;
+
+    cparams.n_batch         = params.n_batch;
+    cparams.n_ctx           = params.n_ctx           == 0 ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base  = params.rope_freq_base  == 0 ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
+    LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
+
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     // reserve memory for context buffers
-    if (!params.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, params.n_gpu_layers)) {
+    if (!hparams.vocab_only) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -5968,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
-    const auto & hparams = ctx->model.hparams;
-
     // resized during inference
     if (params.logits_all) {
-        ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+        ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
     } else {
         ctx->logits.reserve(hparams.n_vocab);
     }
@@ -5990,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
 
         // build worst-case graph
-        int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
-        int n_past   = hparams.n_ctx - n_tokens;
+        int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
+        int n_past   = cparams.n_ctx - n_tokens;
         llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+        ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+
 #ifdef GGML_USE_METAL
-        if (params.n_gpu_layers > 0) {
+        if (model->n_gpu_layers > 0) {
             ctx->ctx_metal = ggml_metal_init(1);
             if (!ctx->ctx_metal) {
                 LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
                 llama_free(ctx);
                 return NULL;
             }
-            ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+            //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
         // measure memory requirements for the graph
         size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-        LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
@@ -6018,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
 #ifdef GGML_USE_METAL
         if (ctx->ctx_metal) {
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
 #ifdef GGML_USE_CUBLAS
-        if (params.low_vram) {
-            LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
-            ggml_cuda_set_scratch_size(0); // disable scratch
-        } else {
-            ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+        ggml_cuda_set_scratch_size(alloc_size);
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+        // calculate total VRAM usage
+        auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+            if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+                size += ggml_nbytes(t);
+            }
+        };
+        size_t model_vram_size = 0;
+        for (const auto & kv : model->tensors_by_name) {
+            add_tensor(kv.second, model_vram_size);
         }
+
+        size_t kv_vram_size = 0;
+        add_tensor(ctx->kv_self.k, kv_vram_size);
+        add_tensor(ctx->kv_self.v, kv_vram_size);
+
+        size_t ctx_vram_size   = alloc_size + kv_vram_size;
+        size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+                total_vram_size / 1024.0 / 1024.0,
+                model_vram_size / 1024.0 / 1024.0,
+                ctx_vram_size   / 1024.0 / 1024.0);
 #endif
     }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
+    if (model->n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
 
-        if (params.use_mmap) {
+        if (ctx->model.mapping) {
             data_ptr  = ctx->model.mapping->addr;
             data_size = ctx->model.mapping->size;
         } else {
@@ -6058,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
             return NULL;                                              \
         }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",  data_ptr, data_size, max_size));
-
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval",  ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",    ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",  data_ptr, data_size, max_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",    ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
@@ -6074,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(
 
     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+        // TODO: needs fix after #3228
+        GGML_ASSERT(false && "not implemented");
+        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
         llama_backend_free();
         exit(1);
     }
@@ -6084,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params) {
-    struct llama_model * model = llama_load_model_from_file(path_model, params);
-    if (!model) {
-        return nullptr;
-    }
-
-    struct llama_context * ctx = llama_new_context_with_model(model, params);
-    ctx->model_owner = true;
-
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
-int llama_n_vocab(const struct llama_context * ctx) {
-    return llama_model_n_vocab(&ctx->model);
+const llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
-    return llama_model_n_ctx(&ctx->model);
-}
-
-int llama_n_ctx_train(const struct llama_context * ctx) {
-    return llama_model_n_ctx_train(&ctx->model);
-}
-
-int llama_n_embd(const struct llama_context * ctx) {
-    return llama_model_n_embd(&ctx->model);
+    return ctx->cparams.n_ctx;
 }
 
-enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
-    return ctx->model.vocab.type;
+enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+    return model->vocab.type;
 }
 
-int llama_model_n_vocab(const struct llama_model * model) {
+int llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
 
-int llama_model_n_ctx(const struct llama_model * model) {
-    return model->hparams.n_ctx;
-}
-
-int llama_model_n_ctx_train(const struct llama_model * model) {
+int llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }
 
-int llama_model_n_embd(const struct llama_model * model) {
+int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            model->name.c_str(),
+            llama_model_arch_name(model->arch).c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
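With the accessors split, per-model metadata comes from llama_model and only the runtime n_ctx stays on the context. A sketch:

    #include "llama.h"
    #include <cstdio>

    static void print_dims(llama_context * ctx) {
        const llama_model * model = llama_get_model(ctx);
        printf("n_vocab=%d n_ctx_train=%d n_embd=%d n_ctx=%d\n",
               llama_n_vocab(model), llama_n_ctx_train(model),
               llama_n_embd(model), llama_n_ctx(ctx));
    }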
@@ -6161,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
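A sketch of the new tensor lookup; the GGUF-style name "token_embd.weight" is typical but model-dependent:

    #include "llama.h"
    #include "ggml.h"

    static struct ggml_tensor * embedding_weights(llama_model * model) {
        return llama_get_model_tensor(model, "token_embd.weight"); // NULL if absent
    }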
@@ -6174,18 +6912,18 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
 
-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -6193,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.n;
+    return ctx->kv_self.head;
 }
 
-#define LLAMA_MAX_RNG_STATE (64*1024)
+void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+}
 
-void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = time(NULL);
-    }
-    ctx->rng.seed(seed);
+void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+}
+
+void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
 // Returns the *maximum* size of the state
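A sketch of the new sequence-level cache operations, e.g. for trimming a conversation while sharing its prefix; sequence ids and positions are illustrative only:

    #include "llama.h"

    static void trim_and_fork(llama_context * ctx, int n_keep) {
        const int n_ctx = llama_n_ctx(ctx);
        // drop cached entries of sequence 0 past n_keep ...
        llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, /*p0=*/n_keep, /*p1=*/n_ctx);
        // ... and let sequence 1 reuse the kept prefix
        llama_kv_cache_seq_cp(ctx, /*seq_id_src=*/0, /*seq_id_dst=*/1, /*p0=*/0, /*p1=*/n_keep);
    }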
@@ -6289,7 +7038,17 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+    // TODO: does not support multi-sequence states
+    {
+        const auto & kv_self = ctx->kv_self;
+        for (uint32_t i = 0; i < kv_self.head; ++i) {
+            GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
+            GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
+            GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
+        }
+    }
+
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6340,12 +7099,14 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd_gqa();
-        const int    n_ctx   = hparams.n_ctx;
+        const int    n_ctx   = cparams.n_ctx;
 
         const size_t kv_size = kv_self.buf.size;
-        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+        const int    kv_ntok = kv_self.head;
 
         data_ctx->write(&kv_size, sizeof(kv_size));
         data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6448,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd_gqa();
-        const int    n_ctx   = hparams.n_ctx;
+        const int    n_ctx   = cparams.n_ctx;
 
         size_t kv_size;
         int    kv_ntok;
@@ -6489,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             ggml_free(cpy_ctx);
         }
 
-        ctx->kv_self.n = kv_ntok;
+        ctx->kv_self.head = kv_ntok;
+        ctx->kv_self.size = kv_size;
     }
 
     const size_t nread = inp - src;
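State (de)serialization now tracks kv_self.head and kv_self.size, but the session-file API is unchanged in shape. A sketch with a placeholder path, where tokens is the prompt that produced the current state:

    #include "llama.h"
    #include <vector>

    static bool save_session(llama_context * ctx, const std::vector<llama_token> & tokens) {
        return llama_save_session_file(ctx, "session.bin", tokens.data(), tokens.size());
    }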
@@ -6584,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 
 int llama_eval(
         struct llama_context * ctx,
-           const llama_token * tokens,
-                         int   n_tokens,
-                         int   n_past,
-                         int   n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+                 llama_token * tokens,
+                     int32_t   n_tokens,
+                         int   n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
-    // get a more accurate load time, upon first eval
-    // TODO: fix this
-    if (!ctx->has_evaluated_once) {
-        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-        ctx->has_evaluated_once = true;
+    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
-    return 0;
+    return ret;
 }
 
 int llama_eval_embd(
         struct llama_context * ctx,
-                 const float * embd,
-                         int   n_tokens,
-                         int   n_past,
-                         int   n_threads) {
-    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+                       float * embd,
+                     int32_t   n_tokens,
+                         int   n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
-    // get a more accurate load time, upon first eval
-    // TODO: fix this
-    if (!ctx->has_evaluated_once) {
-        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-        ctx->has_evaluated_once = true;
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
 
-    return 0;
+    return ret;
 }
 
-int llama_eval_export(struct llama_context * ctx, const char * fname) {
-    const int n_batch = 1;
-    const int n_ctx   = 512 - n_batch;
+void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+    ctx->cparams.n_threads       = n_threads;
+    ctx->cparams.n_threads_batch = n_threads_batch;
+}
+
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                 int32_t   n_tokens,
+               llama_pos   pos_0,
+            llama_seq_id   seq_id) {
+    return {
+        /*n_tokens   =*/ n_tokens,
+        /*tokens     =*/ tokens,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*all_pos_0  =*/ pos_0,
+        /*all_pos_1  =*/ 1,
+        /*all_seq_id =*/ seq_id,
+    };
+}
 
-    const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+    llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
-    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
+    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }
 
-    return 0;
+    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
+    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+
+    return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+    if (batch.token)  free(batch.token);
+    if (batch.embd)   free(batch.embd);
+    if (batch.pos)    free(batch.pos);
+    if (batch.seq_id) free(batch.seq_id);
+    if (batch.logits) free(batch.logits);
+}
+
+int llama_decode(
+        struct llama_context * ctx,
+          struct llama_batch   batch) {
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+    }
+
+    return ret;
 }
 
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
 
+float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+}
+
 float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
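A sketch of the llama_batch path that replaces llama_eval(); with llama_batch_get_one() no per-token logits flags are set, so the context keeps logits for the last token only (unless it was created with logits_all):

    #include "llama.h"
    #include <vector>

    static int decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
        const int ret = llama_decode(ctx,
                llama_batch_get_one(tokens.data(), (int32_t) tokens.size(),
                                    /*pos_0=*/0, /*seq_id=*/0));
        if (ret == 0) {
            const float * last_logits = llama_get_logits(ctx); // logits of the final token
            (void) last_logits;
        }
        return ret; // 0 on success, < 0 on decode error
    }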
@@ -6671,21 +7473,13 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 }
 
 int llama_tokenize(
-        struct llama_context * ctx,
-                  const char * text,
-                 llama_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
-}
-
-int llama_tokenize_with_model(
     const struct llama_model * model,
                   const char * text,
+                         int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
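llama_tokenize() now takes the model plus an explicit byte length. A sketch that sizes the output buffer from the input (tokens never outnumber bytes, plus an optional BOS) and handles the negative too-small signal:

    #include "llama.h"
    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize(const llama_model * model,
                                             const std::string & text, bool add_bos) {
        std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
        const int n = llama_tokenize(model, text.data(), (int) text.size(),
                                     tokens.data(), (int) tokens.size(), add_bos);
        tokens.resize(n < 0 ? 0 : n); // n < 0: buffer was too small (|n| tokens needed)
        return tokens;
    }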
@@ -6699,13 +7493,9 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
-}
-
 // does not write null-terminator to buf
-int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
-    if (0 <= token && token < llama_model_n_vocab(model)) {
+int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+    if (0 <= token && token < llama_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
@@ -6725,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
             buf[2] = '\x85';
             return 3;
         } else if (llama_is_control_token(model->vocab, token)) {
-            ;
+            // do nothing
         } else if (llama_is_byte_token(model->vocab, token)) {
             if (length < 1) {
                 return -1;
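The detokenizer likewise moves to the model; llama_token_to_piece() writes up to length bytes without a terminator and returns a negative count if the buffer is too small. A sketch:

    #include "llama.h"
    #include <string>

    static std::string piece_of(const llama_model * model, llama_token token) {
        char buf[32]; // ample for a single piece
        const int n = llama_token_to_piece(model, token, buf, sizeof(buf));
        return n < 0 ? std::string() : std::string(buf, n);
    }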
@@ -6827,16 +7617,18 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }
 
-void llama_log_set(llama_log_callback log_callback, void * user_data) {
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 }
 
-static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
@@ -6853,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
     va_end(args_copy);
 }
 
-static void llama_log_internal(llama_log_level level, const char * format, ...) {
+static void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
    llama_log_internal_v(level, format, args);
     va_end(args);
 }
 
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
     fputs(text, stderr);
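llama_log_set() now accepts a ggml_log_callback, so one function can serve both ggml and llama logging. A sketch that forwards only errors:

    #include "llama.h"
    #include <cstdio>

    static void errors_only(ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

    static void install_logger(void) {
        llama_log_set(errors_only, NULL);
    }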